6 examples of 'convert pandas dataframe to spark dataframe' in Python

Every line of these 'convert pandas dataframe to spark dataframe' code snippets is scanned for vulnerabilities by our machine learning engine, which combs millions of open source libraries to help keep your Python code secure.


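The most direct way to convert a pandas DataFrame to a Spark DataFrame is SparkSession.createDataFrame, with toPandas for the reverse trip. A minimal sketch, assuming a local SparkSession and Spark 3.x (the Arrow config key is named differently on Spark 2.x):

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("pandas-to-spark").getOrCreate()

# Optional: Arrow-based conversion is usually much faster for larger frames.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

pdf = pd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})

sdf = spark.createDataFrame(pdf)   # pandas -> Spark
pdf_back = sdf.toPandas()          # Spark -> pandas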
def _transform(self, dataset):
    # Push the Python-side params to the wrapped Java transformer, run it on
    # the underlying Java DataFrame, and wrap the result back into a Python
    # DataFrame bound to the same SQL context.
    self._transfer_params_to_java()
    return DataFrame(self._java_obj.transform(dataset._jdf), dataset.sql_ctx)
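This _transform is the internal bridge that pyspark.ml transformers use to hand a Python DataFrame to their wrapped Java object. In user code you normally just call transform() on the transformer; a hedged sketch with an illustrative VectorAssembler and a pandas DataFrame converted first:

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame(pd.DataFrame({"x1": [1.0, 2.0], "x2": [3.0, 4.0]}))

assembler = VectorAssembler(inputCols=["x1", "x2"], outputCol="features")
assembler.transform(sdf).show()   # dispatches to the _transform shown above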
def csvToDataFrame(sqlCtx, rdd, columns=None, sep=",", parseDate=True, nSampl=1000):
    def toRow(line):
        return toRowSep(line, sep)

    rdd_array = rdd.map(toRow)
    rdd_sql = rdd_array
    if columns is None:
        # First row is the header; sample up to nSampl rows to infer types.
        columns = rdd_array.first()
        rdd_sampl = rdd_array.zipWithIndex().filter(
            lambda r_i: r_i[1] > 0 and (nSampl == 0 or r_i[1] < nSampl)).keys()
        rdd_sql = rdd_array.zipWithIndex().filter(lambda r_i: r_i[1] > 0).keys()
        column_types = evaluateType(rdd_sampl, parseDate)

    def toSqlRow(row):
        return toSqlRowWithType(row, column_types)

    schema = makeSchema(zip(columns, column_types))
    return sqlCtx.createDataFrame(rdd_sql.map(toSqlRow), schema=schema)
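A hypothetical call site for the snippet above, assuming toRowSep, evaluateType, toSqlRowWithType and makeSchema are defined in the same module (they are not shown here); the file path is illustrative:

from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext.getOrCreate()
sqlCtx = SQLContext(sc)

lines = sc.textFile("people.csv")      # header row followed by data rows
df = csvToDataFrame(sqlCtx, lines)     # infers column names and types
df.printSchema()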
def from_pandas_rdd(self, pandas_rdd):
    """Create a Sparkling Pandas DataFrame from an RDD whose elements are
    pandas DataFrames. Note: the current version drops index information.

    Parameters
    ----------
    pandas_rdd: RDD[pandas.DataFrame]

    Returns
    -------
    Sparkling Pandas DataFrame."""
    return DataFrame.fromDataFrameRDD(pandas_rdd, self.sql_ctx)
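from_pandas_rdd belongs to the Sparkling Pandas context object. Without that library, an RDD of pandas DataFrames can be flattened into a plain Spark DataFrame along these lines (a sketch that, like the snippet above, drops the pandas index):

import pandas as pd
from pyspark.sql import SparkSession, Row

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# An RDD whose elements are small pandas DataFrames.
pandas_rdd = sc.parallelize([
    pd.DataFrame({"a": [1, 2], "b": ["x", "y"]}),
    pd.DataFrame({"a": [3], "b": ["z"]}),
])

# Flatten each pandas DataFrame into Rows, then build one Spark DataFrame.
rows = pandas_rdd.flatMap(
    lambda pdf: [Row(**rec) for rec in pdf.to_dict(orient="records")])
spark.createDataFrame(rows).show()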
def spark_read_from_jdbc(spark, url, user, password, metastore_table, jdbc_table, driver,
                         save_mode, save_format, fetch_size, num_partitions,
                         partition_column, lower_bound, upper_bound):

    # first set common options
    reader = set_common_options(spark.read, url, jdbc_table, user, password, driver)

    # now set specific read options
    if fetch_size:
        reader = reader.option('fetchsize', fetch_size)
    if num_partitions:
        reader = reader.option('numPartitions', num_partitions)
    if partition_column and lower_bound and upper_bound:
        reader = reader \
            .option('partitionColumn', partition_column) \
            .option('lowerBound', lower_bound) \
            .option('upperBound', upper_bound)

    # read from the JDBC table and save into the metastore table
    reader \
        .load() \
        .write \
        .saveAsTable(metastore_table, format=save_format, mode=save_mode)
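Here set_common_options is assumed to come from the same module and to set the url, dbtable, user, password and driver options. The same read can also be written directly against the DataFrameReader API; the connection details below are purely illustrative:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

df = (spark.read.format("jdbc")
      .option("url", "jdbc:postgresql://db-host:5432/mydb")
      .option("dbtable", "public.orders")
      .option("user", "reader")
      .option("password", "secret")
      .option("driver", "org.postgresql.Driver")
      .option("fetchsize", 1000)
      .option("numPartitions", 4)
      .option("partitionColumn", "id")
      .option("lowerBound", 1)
      .option("upperBound", 1000000)
      .load())

df.write.saveAsTable("orders_copy", format="parquet", mode="overwrite")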
def csvToDataFrame(self, sqlCtx, rdd, columns=None, sep=",", parseDate=True):
    """Converts a CSV plain-text RDD into a Spark SQL DataFrame (formerly SchemaRDD)
    using PySpark. If columns are not given, assumes the first row is the header.
    If the separator is not given, assumes comma separated.
    """
    if self.py_version < 3:
        def toRow(line):
            return self.toRowSep(line.encode('utf-8'), sep)
    else:
        def toRow(line):
            return self.toRowSep(line, sep)

    rdd_array = rdd.map(toRow)
    rdd_sql = rdd_array

    if columns is None:
        columns = rdd_array.first()
        rdd_sql = rdd_array.zipWithIndex().filter(
            lambda r_i: r_i[1] > 0).keys()
    column_types = self.evaluateType(rdd_sql, parseDate)

    def toSqlRow(row):
        return self.toSqlRowWithType(row, column_types)

    schema = self.makeSchema(zip(columns, column_types))

    return sqlCtx.createDataFrame(rdd_sql.map(toSqlRow), schema=schema)
def _sparksql(self, jvm, converter):
    # Build the JVM-side CentrallyBin aggregator from the bin centres, the
    # SparkSQL expression for the quantity, and the converted sub-aggregators.
    return converter.CentrallyBin(
        [c for c, v in self.bins],
        self.quantity.asSparkSQL(),
        self.bins[0][1]._sparksql(jvm, converter),
        self.nanflow._sparksql(jvm, converter))
