-
Notifications
You must be signed in to change notification settings - Fork 900
/
pyspark-cast-column.py
39 lines (31 loc) · 1.22 KB
/
pyspark-cast-column.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 13 21:08:30 2020
@author: NNK
"""
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
simpleData = [("James",34,"2006-01-01","true","M",3000.60),
("Michael",33,"1980-01-10","true","F",3300.80),
("Robert",37,"06-01-1992","false","M",5000.50)
]
columns = ["firstname","age","jobStartDate","isGraduated","gender","salary"]
df = spark.createDataFrame(data = simpleData, schema = columns)
df.printSchema()
df.show(truncate=False)
from pyspark.sql.functions import col
from pyspark.sql.types import StringType,BooleanType,DateType
df2 = df.withColumn("age",col("age").cast(StringType())) \
.withColumn("isGraduated",col("isGraduated").cast(BooleanType())) \
.withColumn("jobStartDate",col("jobStartDate").cast(DateType()))
df2.printSchema()
df3 = df2.selectExpr("cast(age as int) age",
"cast(isGraduated as string) isGraduated",
"cast(jobStartDate as string) jobStartDate")
df3.printSchema()
df3.show(truncate=False)
df3.createOrReplaceTempView("CastExample")
df4 = spark.sql("SELECT STRING(age),BOOLEAN(isGraduated),DATE(jobStartDate) from CastExample")
df4.printSchema()
df4.show(truncate=False)