forked from spark-examples/pyspark-examples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pyspark-arraytype.py
44 lines (34 loc) · 1.38 KB
/
pyspark-arraytype.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
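# -*- coding: utf-8 -*-
"""
PySpark ArrayType examples: build a DataFrame with array columns and apply
explode, split, array and array_contains to it.

Author: SparkByExamples.com
"""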
from pyspark.sql import SparkSession
from pyspark.sql.functions import array, array_contains, explode, split
from pyspark.sql.types import ArrayType, StringType, StructField, StructType

# Obtain the session through getOrCreate() under the application name below.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Stand-alone ArrayType example: string elements, with the second argument
# (containsNull) set to False.  The schema below does not use this object; it
# builds its array columns with ArrayType(StringType()) instead.
arrayCol = ArrayType(StringType(), False)

# Sample rows, in schema order: name, languagesAtSchool, languagesAtWork,
# currentState, previousState.  "name" is a single comma-separated string,
# which the split() example further down turns into an array.
data = [
    ("James,,Smith", ["Java", "Scala", "C++"], ["Spark", "Java"], "OH", "CA"),
    ("Michael,Rose,", ["Spark", "Java", "C++"], ["Spark", "Java"], "NY", "NJ"),
    ("Robert,,Williams", ["CSharp", "VB"], ["Spark", "Python"], "UT", "NV"),
]

# Explicit schema: two array-of-string columns and three plain string
# columns, every field declared nullable.
schema = StructType([
    StructField("name", StringType(), True),
    StructField("languagesAtSchool", ArrayType(StringType()), True),
    StructField("languagesAtWork", ArrayType(StringType()), True),
    StructField("currentState", StringType(), True),
    StructField("previousState", StringType(), True),
])

df = spark.createDataFrame(data=data, schema=schema)
df.printSchema()
df.show()

# explode(): show each element of languagesAtSchool next to the name.
df.select(df.name, explode(df.languagesAtSchool)).show()

# split(): break the comma-separated name string into an array column.
df.select(split(df.name, ",").alias("nameAsArray")).show()

# array(): combine the two state columns into a single array column.
df.select(
    df.name,
    array(df.currentState, df.previousState).alias("States"),
).show()

# array_contains(): flag whether languagesAtSchool contains "Java".
df.select(
    df.name,
    array_contains(df.languagesAtSchool, "Java").alias("array_contains"),
).show()