Every line of the 'pyspark loop through dataframe' code snippets below is scanned for vulnerabilities by our machine learning engine, which combs millions of open source libraries to help ensure your Python code is secure.
```python
import operator
import pyspark


def main():
    '''Program entry point'''

    # Initialize a Spark context
    with pyspark.SparkContext("local", "PySparkWordCount") as sc:
        # Get an RDD containing the lines of this script file
        lines = sc.textFile(__file__)
        # Split each line into words and assign a frequency of 1 to each word
        words = lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1))
        # Sum the frequencies for each word
        counts = words.reduceByKey(operator.add)
        # Sort the counts in descending order of word frequency
        sorted_counts = counts.sortBy(lambda x: x[1], False)
        # Iterate over the counts to print each word and its frequency
        for word, count in sorted_counts.toLocalIterator():
            print(u"{} --> {}".format(word, count))
```
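The snippet above loops over an RDD, but the same `toLocalIterator()` pattern works directly on a DataFrame, which is usually what the "loop through a DataFrame" question is after. A minimal sketch (the example data and column names are made up for illustration):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("DataFrameLoop").getOrCreate()
df = spark.createDataFrame([("spark", 3), ("loop", 1)], ["word", "count"])

# toLocalIterator() streams rows to the driver one partition at a time,
# so the full DataFrame never has to fit in driver memory at once
for row in df.toLocalIterator():
    print(f"{row['word']} --> {row['count']}")
```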
```python
def _transform(self, dataset):
    # Push the Python-side params to the wrapped Java object, then call its
    # transform() and wrap the resulting Java DataFrame back into a Python one
    self._transfer_params_to_java()
    return DataFrame(self._java_obj.transform(dataset._jdf), dataset.sql_ctx)
```
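For context, `_transform` is the internal hook that the public `Transformer.transform()` method delegates to after resolving params; user code calls `transform()` instead. A short usage sketch with the built-in `Tokenizer`, one of the Java-backed transformers that inherits this method:

```python
from pyspark.ml.feature import Tokenizer
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("TokenizerDemo").getOrCreate()
df = spark.createDataFrame([("loop through a dataframe",)], ["text"])

tokenizer = Tokenizer(inputCol="text", outputCol="words")
# transform() resolves params and calls _transform() under the hood
for row in tokenizer.transform(df).toLocalIterator():
    print(row["words"])
```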
```python
import sys
import time

from pyspark.ml.regression import GBTRegressor, GeneralizedLinearRegression, LinearRegression
from pyspark.sql import SparkSession

# SparkRegressor wraps the train/test split, fitting, and metric collection;
# this snippet appears to come from mmtf-pyspark, where it lives in mmtfPyspark.ml
from mmtfPyspark.ml import SparkRegressor


def main(argv):

    # Name of the prediction (label) column
    label = argv[1]

    start = time.time()

    spark = SparkSession.builder \
        .master("local[*]") \
        .appName("datasetRegressor") \
        .getOrCreate()

    data = spark.read.parquet(argv[0]).cache()

    vector = data.first()
    print(vector)
    featureCount = len(vector)
    print(f"Feature count : {featureCount}")

    print(f"Dataset size (unbalanced) : {data.count()}")

    testFraction = 0.3
    seed = 123

    # Linear Regression
    lr = LinearRegression().setLabelCol(label) \
        .setFeaturesCol("features")
    reg = SparkRegressor(lr, label, testFraction, seed)
    metrics = reg.fit(data)
    for k, v in metrics.items():
        print(f"{k}\t{v}")

    # GBTRegressor
    gbt = GBTRegressor().setLabelCol(label) \
        .setFeaturesCol("features")
    reg = SparkRegressor(gbt, label, testFraction, seed)
    metrics = reg.fit(data)
    for k, v in metrics.items():
        print(f"{k}\t{v}")

    # GeneralizedLinearRegression
    glr = GeneralizedLinearRegression().setLabelCol(label) \
        .setFeaturesCol("features") \
        .setFamily("gaussian") \
        .setLink("identity") \
        .setMaxIter(10) \
        .setRegParam(0.3)
    reg = SparkRegressor(glr, label, testFraction, seed)
    metrics = reg.fit(data)
    for k, v in metrics.items():
        print(f"{k}\t{v}")

    end = time.time()
    print("Time: %f sec." % (end - start))


if __name__ == "__main__":
    main(sys.argv[1:])
```
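The three regressor blocks above differ only in the estimator, so the fit-and-report step can be collapsed into a loop. A sketch under the same assumptions as `main()` above (`lr`, `gbt`, `glr`, `label`, `testFraction`, `seed`, and `data` already defined, and `SparkRegressor` being the project's helper class):

```python
# assumes the names defined in main() above are in scope
for estimator in (lr, gbt, glr):
    reg = SparkRegressor(estimator, label, testFraction, seed)
    metrics = reg.fit(data)
    for k, v in metrics.items():
        print(f"{k}\t{v}")
```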