pyspark-session-2021-01-19.txt
$ ./bin/pyspark
Python 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018, 02:44:43)
[Clang 6.0 (clang-600.0.57)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
21/01/19 20:03:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 3.0.0
      /_/
Using Python version 3.7.2 (v3.7.2:9a3ffc0492, Dec 24 2018 02:44:43)
SparkSession available as 'spark'.
>>>
>>>
>>> tuples2 = [('alex', 4), ('alex', 5), ('bob', 40), ('bob', 50), ('bob', 4)]
>>> tuples2
[('alex', 4), ('alex', 5), ('bob', 40), ('bob', 50), ('bob', 4)]
>>>
>>>
>>> pairs_rdd = spark.sparkContext.parallelize(tuples2)
>>> pairs_rdd
ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:262
>>> pairs_rdd.collect()
[('alex', 4), ('alex', 5), ('bob', 40), ('bob', 50), ('bob', 4)]
>>> pairs_rdd.count()
5
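>>> # parallelize() distributes a local Python collection as an RDD;
>>> # collect() returns every element to the driver, count() returns how many there are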
>>> tuples33 = [('alex', 4, 44), ('alex', 5, 55), ('bob', 40, 66)]
>>> tuples33
[('alex', 4, 44), ('alex', 5, 55), ('bob', 40, 66)]
>>> rdd = spark.sparkContext.parallelize(tuples33)
>>>
>>> rdd.collect()
[('alex', 4, 44), ('alex', 5, 55), ('bob', 40, 66)]
>>> rdd.count()
3
>>>
>>>
>>>
>>> pairs_rdd.collect()
[('alex', 4), ('alex', 5), ('bob', 40), ('bob', 50), ('bob', 4)]
>>> new_rdd = pairs_rdd.map(lambda x: (x[0], x[1], 2*int(x[1])))
>>> new_rdd.collect()
[('alex', 4, 8), ('alex', 5, 10), ('bob', 40, 80), ('bob', 50, 100), ('bob', 4, 8)]
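>>> # map() is a lazy transformation applied to each element; nothing is
>>> # computed until an action such as collect() or count() is called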
>>>
>>> columns = ["name", "age", "salary"]
>>> some_tuples = [('alex', 40, 80000), ('alex', 50, 1000000), ('bob', 40, 8000000), ('bob', 50, 10000)]
>>> df = spark.createDataFrame(some_tuples, columns)
>>> df.show()
+----+---+-------+
|name|age| salary|
+----+---+-------+
|alex| 40| 80000|
|alex| 50|1000000|
| bob| 40|8000000|
| bob| 50| 10000|
+----+---+-------+
>>> df.printSchema()
root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- salary: long (nullable = true)
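>>> # createDataFrame() infers the schema from the Python values:
>>> # strings become string columns and ints become long columns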
>>> rdd = spark.sparkContext.parallelize(some_tuples)
>>> rdd.collect()
[('alex', 40, 80000), ('alex', 50, 1000000), ('bob', 40, 8000000), ('bob', 50, 10000)]
>>> rdd.take(2)
[('alex', 40, 80000), ('alex', 50, 1000000)]
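>>> # take(n) fetches only the first n elements, which is cheaper than collect()
>>> # on a large RDD; as a sketch (not run in this session), the same DataFrame
>>> # could also be built straight from this RDD with: df2 = rdd.toDF(columns)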
>>>
>>>
>>> data = ["alex,20", "alex,30", "bob,40", "bob,50", "bob,60"]
>>> data
['alex,20', 'alex,30', 'bob,40', 'bob,50', 'bob,60']
>>>
>>>
>>> rdd = spark.sparkContext.parallelize(data)
>>> rdd.collect()
['alex,20', 'alex,30', 'bob,40', 'bob,50', 'bob,60']
>>> rdd.count()
5
>>> def create_pairs(rec):
... tokens = rec.split(",")
... key = tokens[0]
... value = tokens[1]
... return (key, value)
...
>>>
>>> pairs = rdd.map(lambda x: create_pairs(x))
>>> pairs.collect()
[('alex', '20'), ('alex', '30'), ('bob', '40'), ('bob', '50'), ('bob', '60')]
>>> pairs.count()
5
>>> pairs = rdd.map(create_pairs)
>>> pairs.collect()
[('alex', '20'), ('alex', '30'), ('bob', '40'), ('bob', '50'), ('bob', '60')]
>>> pairs.count()
5
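>>> # rdd.map(create_pairs) and rdd.map(lambda x: create_pairs(x)) are equivalent;
>>> # passing the function directly is the more idiomatic form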
>>>
>>> sum_by_key = pairs.reduceByKey(lambda x, y: x+y)
>>> sum_by_key.collect()
[('bob', '405060'), ('alex', '2030')]
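>>> # the values are still strings, so the '+' in reduceByKey() concatenates them
>>> # per key instead of adding; converting each value to int (below) fixes this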
>>>
>>> def create_pair(rec):
... tokens = rec.split(",")
... key = tokens[0]
... value = int(tokens[1])
... return (key, value)
...
>>>
>>> rdd2 = rdd.map(lambda x: create_pair(x))
>>> rdd2.collect()
[('alex', 20), ('alex', 30), ('bob', 40), ('bob', 50), ('bob', 60)]
>>> sum_by_key = rdd2.reduceByKey(lambda x, y: x+y)
>>> sum_by_key.collect()
[('bob', 150), ('alex', 50)]
>>>
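>>> # the same per-key sum could also be computed with the DataFrame API,
>>> # as a sketch (not part of this session):
>>> #   df2 = rdd2.toDF(["name", "value"])
>>> #   df2.groupBy("name").sum("value").show()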