4 examples of 'pyspark iterate over dataframe' in Python

Every line of the 'pyspark iterate over dataframe' code snippets below is scanned for vulnerabilities by our machine learning engine, which combs millions of open source libraries to help keep your Python code secure.

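For orientation, here is a minimal sketch of the two idioms usually meant by 'iterate over dataframe': collecting rows to the driver and streaming them with toLocalIterator(). The SparkSession setup, column names, and sample rows are illustrative assumptions, not part of the scanned snippets.

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("iterate-example").getOrCreate()
df = spark.createDataFrame([("alice", 1), ("bob", 2)], ["name", "count"])

# collect() pulls every row to the driver at once; fine for small results
for row in df.collect():
    print(row["name"], row["count"])

# toLocalIterator() streams one partition at a time to the driver,
# so the whole DataFrame never has to fit in driver memory
for row in df.toLocalIterator():
    print(row["name"], row["count"])

spark.stop()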
def _transform(self, dataset):
    # Push the Python-side params to the wrapped JVM transformer, then call its
    # transform() on the underlying Java DataFrame and wrap the result again
    self._transfer_params_to_java()
    return DataFrame(self._java_obj.transform(dataset._jdf), dataset.sql_ctx)
def transform(self, dstreams, transformFunc):
    """
    Create a new DStream in which each RDD is generated by applying
    a function on RDDs of the DStreams. The order of the JavaRDDs in
    the transform function parameter will be the same as the order
    of corresponding DStreams in the list.
    """
    jdstreams = [d._jdstream for d in dstreams]
    # change the final serializer to sc.serializer
    func = TransformFunction(self._sc,
                             lambda t, *rdds: transformFunc(rdds).map(lambda x: x),
                             *[d._jrdd_deserializer for d in dstreams])
    jfunc = self._jvm.TransformFunction(func)
    jdstream = self._jssc.transform(jdstreams, jfunc)
    return DStream(jdstream, self, self._sc.serializer)
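As a hedged usage sketch (not from the scanned code): the method above hands the per-batch RDDs of every listed DStream, in order, to transformFunc, so a caller can combine two streams as below. The queued input data, batch interval, and timeout are illustrative assumptions.

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext("local[2]", "transform-example")
ssc = StreamingContext(sc, batchDuration=1)

# Two toy input streams built from queued RDDs (illustrative data)
evens = ssc.queueStream([sc.parallelize([0, 2, 4])])
odds = ssc.queueStream([sc.parallelize([1, 3, 5])])

# transformFunc receives the RDDs of both DStreams for each batch, in order
merged = ssc.transform([evens, odds], lambda rdds: rdds[0].union(rdds[1]))
merged.pprint()

ssc.start()
ssc.awaitTerminationOrTimeout(5)
ssc.stop(stopSparkContext=True, stopGraceFully=False)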
import operator

import pyspark


def main():
    '''Program entry point'''

    # Initialize a Spark context
    with pyspark.SparkContext("local", "PySparkWordCount") as sc:
        # Get an RDD containing lines from this script file
        lines = sc.textFile(__file__)
        # Split each line into words and assign a frequency of 1 to each word
        words = lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1))
        # Count the frequency of each word
        counts = words.reduceByKey(operator.add)
        # Sort the counts in descending order based on the word frequency
        sorted_counts = counts.sortBy(lambda x: x[1], False)
        # Get an iterator over the counts to print a word and its frequency
        for word, count in sorted_counts.toLocalIterator():
            print(u"{} --> {}".format(word, count))
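The same toLocalIterator() idiom shown here for an RDD is also available on DataFrames (DataFrame.toLocalIterator() yields Row objects partition by partition), which is generally safer than collect() when the result is large.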
import pysparkling


def test_groupBy():
    # Group [4, 7, 2] by parity; each collected entry is a (key, list_of_values) pair
    my_rdd = pysparkling.Context().parallelize([4, 7, 2])
    grouped = my_rdd.groupBy(lambda x: x % 2).collect()
    assert grouped[0][1][1] == 2
