-
Notifications
You must be signed in to change notification settings - Fork 474
/
Copy pathpyspark-session-2018-10-02.txt
executable file
·100 lines (99 loc) · 2.87 KB
/
pyspark-session-2018-10-02.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
mparsian@Mahmouds-MacBook ~/spark-2.3.0 $ ./zbin/zenv_setup.sh
mparsian@Mahmouds-MacBook ~/spark-2.3.0 $ ./bin/pyspark
Python 2.7.10 (default, Oct 6 2017, 22:29:07)
[GCC 4.2.1 Compatible Apple LLVM 9.0.0 (clang-900.0.31)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
18/10/02 15:50:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.3.0
      /_/
Using Python version 2.7.10 (default, Oct 6 2017 22:29:07)
SparkSession available as 'spark'.
>>>
>>>
>>>
>>>
>>> spark
<pyspark.sql.session.SparkSession object at 0x1082bcc50>
>>> spark.sparkContext
<SparkContext master=local[*] appName=PySparkShell>
>>>
>>> spark.version
u'2.3.0'
>>>
>>>
>>>
>>>
>>>
>>>
>>> input_path = "/Users/mparsian/spark-2.3.0/myinput.txt"
>>> myrdd = spark.sparkContext.textFile(input_path)
>>> rdd.count()
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
NameError: name 'rdd' is not defined
>>> myrdd.count()
3
>>> myrdd.collect()
[u'this is record 1', u'this is record 2', u'this is record 3']
>>>
>>>
>>> def tokenize(rec):
...     tokens = rec.split()
...     return tokens
...
>>>
>>> rec33 = "this is it"
>>> mytokens = tokenize(rec33)
>>> mytokens
['this', 'is', 'it']
>>>
>>>
>>> words = myrdd.flatMap(lambda record: tokenize(record))
>>> words.collect()
[u'this', u'is', u'record', u'1', u'this', u'is', u'record', u'2', u'this', u'is', u'record', u'3']
>>> words.count()
12
>>>
>>> duplicated = myrdd.map(lambda rec: rec + ";" rec)
  File "<stdin>", line 1
    duplicated = myrdd.map(lambda rec: rec + ";" rec)
                                                   ^
SyntaxError: invalid syntax
>>> duplicated = myrdd.map(lambda rec: rec + ";" + rec)
>>> duplicated.count()
3
>>> duplicated.collect()
[u'this is record 1;this is record 1', u'this is record 2;this is record 2', u'this is record 3;this is record 3']
>>>
>>> def myconcat(rec):
...     return rec + ";" + rec
...
>>>
>>> z = myconcat("testing")
>>> z
'testing;testing'
>>> duplicated2 = myrdd.map(myconcat)
>>> duplicated2.count()
3
>>> duplicated2.collect()
[u'this is record 1;this is record 1', u'this is record 2;this is record 2', u'this is record 3;this is record 3']
>>>
>>>
>>>
>>> words.collect()
[u'this', u'is', u'record', u'1', u'this', u'is', u'record', u'2', u'this', u'is', u'record', u'3']
>>> words.count()
12
>>> pairs = words.map(lambda w: (w, 1))
>>> pairs.collect()
[(u'this', 1), (u'is', 1), (u'record', 1), (u'1', 1), (u'this', 1), (u'is', 1), (u'record', 1), (u'2', 1), (u'this', 1), (u'is', 1), (u'record', 1), (u'3', 1)]
>>> pairs.count()
12
>>> freq = pairs.reduceByKey(lambda x, y : x+y)
>>> freq.collect()
[(u'this', 3), (u'1', 1), (u'is', 3), (u'3', 1), (u'record', 3), (u'2', 1)]
>>>