pyspark-session-2022-04-12.txt
~ % cd spark-3.2.0
spark-3.2.0 % ls -l
total 192
-rwxrwxrwx@ 1 mparsian staff 22878 Oct 6 2021 LICENSE
-rwxrwxrwx@ 1 mparsian staff 57677 Oct 6 2021 NOTICE
drwxrwxrwx@ 3 mparsian staff 96 Oct 6 2021 R
-rwxrwxrwx@ 1 mparsian staff 4512 Oct 6 2021 README.md
-rwxrwxrwx@ 1 mparsian staff 167 Oct 6 2021 RELEASE
drwxrwxrwx@ 29 mparsian staff 928 Nov 17 18:15 bin
drwxrwxrwx@ 9 mparsian staff 288 Nov 17 18:15 conf
drwxrwxrwx@ 5 mparsian staff 160 Nov 17 18:15 data
drwxrwxrwx@ 4 mparsian staff 128 Oct 6 2021 examples
drwxrwxrwx@ 237 mparsian staff 7584 Nov 17 18:15 jars
drwxrwxrwx@ 4 mparsian staff 128 Nov 17 18:15 kubernetes
drwxrwxrwx@ 60 mparsian staff 1920 Nov 17 18:15 licenses
drwxrwxrwx@ 20 mparsian staff 640 Nov 17 18:15 python
drwxrwxrwx@ 29 mparsian staff 928 Nov 17 18:15 sbin
drwxrwxrwx@ 3 mparsian staff 96 Oct 6 2021 yarn
spark-3.2.0 % ./bin/pyspark
Python 3.8.9 (default, Mar 30 2022, 13:51:17)
[Clang 13.1.6 (clang-1316.0.21.2.3)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/ '_/
   /__ / .__/\_,_/_/ /_/\_\   version 3.2.0
      /_/
Using Python version 3.8.9 (default, Mar 30 2022 13:51:17)
Spark context Web UI available at http://10.0.0.234:4040
Spark context available as 'sc' (master = local[*], app id = local-1649822374103).
SparkSession available as 'spark'.
>>>
>>>
>>> spark.version
'3.2.0'
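
The shell also exposes the SparkContext directly as 'sc' (it is the same
object as spark.sparkContext), so, continuing this session, one would expect:

>>> sc.master
'local[*]'
>>> sc is spark.sparkContext
True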
>>>
>>>
>>> numbers = [1, 2, 5, 6, 7, 8, 9, 10, 30, 40, 50]
>>> numbers
[1, 2, 5, 6, 7, 8, 9, 10, 30, 40, 50]
>>> # rdd = Resilient Distributed Dataset
>>> rdd = spark.sparkContext.parallelize(numbers)
>>> rdd.collect()
[1, 2, 5, 6, 7, 8, 9, 10, 30, 40, 50]
>>> # an RDD is partitioned, read-only, and processed in parallel
>>> rdd.count()
11
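
The default partition count follows the number of local cores, so it is
machine-dependent; asking parallelize() for an explicit number of partitions
makes the layout predictable. Continuing the session (the exact split below
is what PySpark's contiguous slicing of the input list would produce, but it
is an implementation detail):

>>> rdd3 = spark.sparkContext.parallelize(numbers, 3)   # request 3 partitions
>>> rdd3.getNumPartitions()
3
>>> rdd3.glom().collect()   # glom() exposes one sub-list per partition
[[1, 2, 5], [6, 7, 8, 9], [10, 30, 40, 50]]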
>>> total = rdd.reduce(lambda x, y: x+y)
>>> total
168
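
reduce() is an action, and the function passed to it should be associative
and commutative, because partial results are combined across partitions.
Transformations such as map() are lazy; nothing runs until an action like
collect() is called. Continuing with the same rdd:

>>> rdd.map(lambda x: x * 2).collect()   # map() is lazy; collect() triggers the job
[2, 4, 10, 12, 14, 16, 18, 20, 60, 80, 100]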
>>> rdd_greater_than_20 = rdd.filter(lambda x : x > 20)
>>> rdd_greater_than_20.collect()
[30, 40, 50]
>>>
>>> rdd_greater_than_20.count()
3
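
Transformations chain without touching the data; a single Spark job runs
when the final action fires. For example, continuing the session:

>>> rdd.filter(lambda x: x > 20).map(lambda x: x * 10).collect()
[300, 400, 500]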
>>> rdd.take(3)
[1, 2, 5]
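
take(n) returns the first n elements in partition order, which is why it
mirrors the head of the original list; top(n) instead returns the n largest:

>>> rdd.top(3)
[50, 40, 30]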
>>>
>>> ^D