3 examples of 'pyspark loop through dataframe' in Python

Every line of these 'pyspark loop through dataframe' code snippets is scanned for vulnerabilities by our machine learning engine, which combs millions of open source libraries to help keep your Python code secure.

import operator
import pyspark

def main():
    '''Program entry point'''

    # Initialize a Spark context
    with pyspark.SparkContext("local", "PySparkWordCount") as sc:
        # Get an RDD containing lines from this script file
        lines = sc.textFile(__file__)
        # Split each line into words and assign a frequency of 1 to each word
        words = lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1))
        # Count the frequency of each word
        counts = words.reduceByKey(operator.add)
        # Sort the counts in descending order based on word frequency
        sorted_counts = counts.sortBy(lambda x: x[1], False)
        # Get an iterator over the counts to print a word and its frequency
        for word, count in sorted_counts.toLocalIterator():
            print(u"{} --> {}".format(word, count))

if __name__ == "__main__":
    main()
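
The snippet above loops over an RDD rather than a DataFrame. For the DataFrame equivalent, here is a minimal sketch (the toy data and column names are invented for illustration) using DataFrame.toLocalIterator(), which streams rows to the driver one partition at a time instead of materializing the whole result with collect():

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("loopExample").getOrCreate()

# Hypothetical toy DataFrame, for illustration only
df = spark.createDataFrame([("spark", 3), ("loop", 1)], ["word", "count"])

# toLocalIterator() yields Row objects one partition at a time,
# avoiding the memory spike of collect() on large results
for row in df.toLocalIterator():
    print(f"{row['word']} --> {row['count']}")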
# From PySpark's ML wrapper layer (JavaTransformer): pushes the Python-side
# params to the underlying Java transformer, runs it, and wraps the result
# back into a Python DataFrame
def _transform(self, dataset):
    self._transfer_params_to_java()
    return DataFrame(self._java_obj.transform(dataset._jdf), dataset.sql_ctx)
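
This snippet is PySpark-internal plumbing: user code reaches _transform() by calling transform() on a fitted model or Transformer. A minimal usage sketch, with the model and training data invented for illustration, that transforms a DataFrame and loops over the resulting predictions:

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("transformExample").getOrCreate()

# Hypothetical training data: one feature column "x" and a label "y"
train = spark.createDataFrame([(1.0, 2.0), (2.0, 4.1), (3.0, 5.9)], ["x", "y"])
train = VectorAssembler(inputCols=["x"], outputCol="features").transform(train)

model = LinearRegression(featuresCol="features", labelCol="y").fit(train)

# transform() ultimately calls the _transform() shown above
predictions = model.transform(train)
for row in predictions.toLocalIterator():
    print(row["y"], row["prediction"])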
import sys
import time

from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression, GBTRegressor, GeneralizedLinearRegression

# SparkRegressor is a project-specific helper (not part of pyspark itself);
# the import path below is assumed from the mmtf-pyspark project
from mmtfPyspark.ml import SparkRegressor

def main(argv):

    # Name of the prediction (label) column
    label = argv[1]

    start = time.time()

    spark = SparkSession.builder \
        .master("local[*]") \
        .appName("datasetRegressor") \
        .getOrCreate()

    data = spark.read.parquet(argv[0]).cache()

    vector = data.first()
    print(vector)
    featureCount = len(vector)
    print(f"Feature count : {featureCount}")

    print(f"Dataset size (unbalanced) : {data.count()}")

    testFraction = 0.3
    seed = 123

    # Linear Regression
    lr = LinearRegression().setLabelCol(label) \
        .setFeaturesCol("features")
    reg = SparkRegressor(lr, label, testFraction, seed)
    metrics = reg.fit(data)
    for k, v in metrics.items():
        print(f"{k}\t{v}")

    # GBTRegressor
    gbt = GBTRegressor().setLabelCol(label) \
        .setFeaturesCol("features")
    reg = SparkRegressor(gbt, label, testFraction, seed)
    metrics = reg.fit(data)
    for k, v in metrics.items():
        print(f"{k}\t{v}")

    # GeneralizedLinearRegression
    glr = GeneralizedLinearRegression().setLabelCol(label) \
        .setFeaturesCol("features") \
        .setFamily("gaussian") \
        .setLink("identity") \
        .setMaxIter(10) \
        .setRegParam(0.3)
    reg = SparkRegressor(glr, label, testFraction, seed)
    metrics = reg.fit(data)
    for k, v in metrics.items():
        print(f"{k}\t{v}")

    end = time.time()
    print("Time: %f sec." % (end - start))

if __name__ == "__main__":
    main(sys.argv[1:])
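
All three snippets iterate over results on the driver. When the per-row work should instead run on the executors (for example, writing each partition to an external store), DataFrame.foreach() or foreachPartition() keeps the loop distributed. A minimal sketch, with the DataFrame contents invented for illustration:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("foreachExample").getOrCreate()

df = spark.createDataFrame([(i, i * i) for i in range(100)], ["n", "n_squared"])

def handle_partition(rows):
    # Runs on the executors; 'rows' is an iterator of Row objects
    for row in rows:
        pass  # e.g. write to an external store, one connection per partition

df.foreachPartition(handle_partition)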
