forked from spark-examples/pyspark-examples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pyspark-row.py
65 lines (53 loc) · 1.71 KB
/
pyspark-row.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# -*- coding: utf-8 -*-
"""
author SparkByExamples.com
"""
from pyspark.sql import SparkSession, Row
row=Row("James",40)
print(row[0] +","+str(row[1]))
row2=Row(name="Alice", age=11)
print(row2.name)
Person = Row("name", "age")
p1=Person("James", 40)
p2=Person("Alice", 35)
print(p1.name +","+p2.name)
#PySpark Example
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
rdd2 = spark.sparkContext.parallelize([],10)
data = [Row(name="James,,Smith",lang=["Java","Scala","C++"],state="CA"),
Row(name="Michael,Rose,",lang=["Spark","Java","C++"],state="NJ"),
Row(name="Robert,,Williams",lang=["CSharp","VB"],state="NV")]
#RDD Example 1
rdd=spark.sparkContext.parallelize(data)
collData=rdd.collect()
print(collData)
for row in collData:
print(row.name + "," +str(row.lang))
# RDD Example 2
Person=Row("name","lang","state")
data = [Person("James,,Smith",["Java","Scala","C++"],"CA"),
Person("Michael,Rose,",["Spark","Java","C++"],"NJ"),
Person("Robert,,Williams",["CSharp","VB"],"NV")]
rdd=spark.sparkContext.parallelize(data)
collData=rdd.collect()
print(collData)
for person in collData:
print(person.name + "," +str(person.lang))
#DataFrame Example 1
columns = ["name","languagesAtSchool","currentState"]
df=spark.createDataFrame(data)
df.printSchema()
df.show()
collData=df.collect()
print(collData)
for row in collData:
print(row.name + "," +str(row.lang))
#DataFrame Example 2
data = [("James,,Smith",["Java","Scala","C++"],"CA"),
("Michael,Rose,",["Spark","Java","C++"],"NJ"),
("Robert,,Williams",["CSharp","VB"],"NV")]
columns = ["name","languagesAtSchool","currentState"]
df=spark.createDataFrame(data).toDF(*columns)
df.printSchema()
for row in df.collect():
print(row.name)