mrjob AWS e Local

samuelsantosdev · samuelsantosdev · commit b5f87cb85f1b · 2019-11-18T16:07:24.000-03:00
diff --git a/README.md b/README.md
@@ -15,9 +15,11 @@ SPARK_PATH=<path_to_projetct>/spark/spark-3.0.0-preview-bin-hadoop2.7/
 SPARK_MEM=1gb
 APP_NAME=Spark Hadoop Teste
 
-### Run the test
+### Run 
 ```
-$ python main.py
+$ cd pyspark
+$ pip install -r requirements.txt
+$ python run.py
 +--------+--------+--------+-------+------------------+                         
 |    Name|Year2000|Year2010|   Diff|          Increase|
 +--------+--------+--------+-------+------------------+
@@ -34,4 +36,41 @@ $ python main.py
 +--------+--------+--------+-------+------------------+
 ```
 
-# Examplo MrJob
+# MrJob Example
+
+### Running MrJob Locally
+```
+$ cd mrjob
+$ pip install -r requirements.txt
+$ python count_words.py data/input_data.txt > data/output.txt
+```
+
+### Running MrJob in AWS EMR
+```
+$ aws emr create-default-roles 
+```
+Set your AWS credentials in `.mrjob.conf`:
+
+```
+$ cd mrjob
+$ pip install -r requirements.txt
+$ python count_words.py data/input_data.txt -r emr
+
+Using s3://mrjob-bb3fb02bec0467d2/tmp/ as our temp dir on S3
+Creating temp directory /tmp/count_words.marbeik.20191118.185622.785260
+writing master bootstrap script to /tmp/count_words.marbeik.20191118.185622.785260/b.sh
+uploading working dir files to s3://mrjob-bb3fb02bec0467d2/tmp/count_words.marbeik.20191118.185622.785260/files/wd...
+Copying other local files to s3://mrjob-bb3fb02bec0467d2/tmp/count_words.marbeik.20191118.185622.785260/files/
+Created new cluster j-1ZY5Z37LMZHWF
+Added EMR tags to cluster j-1ZY5Z37LMZHWF: __mrjob_label=count_words, __mrjob_owner=marbeik, __mrjob_version=0.6.12
+Waiting for Step 1 of 1 (s-1TFAJLTSJSWXK) to complete...
+  PENDING (cluster is STARTING)
+  PENDING (cluster is STARTING)
+  PENDING (cluster is BOOTSTRAPPING: Running bootstrap actions)
+  PENDING (cluster is BOOTSTRAPPING: Running bootstrap actions)
+  PENDING (cluster is BOOTSTRAPPING: Running bootstrap actions)
+  PENDING (cluster is RUNNING: Running step)
+  master node is ec2-52-42-249-126.us-west-2.compute.amazonaws.com
+  RUNNING for 0:00:33 ...
+
+```
diff --git a/mrjob/.mrjob.conf b/mrjob/.mrjob.conf
@@ -0,0 +1,7 @@
+# mrjob runner configuration; the `emr` section is used when the job is run with `-r emr`.
+runners:
+  emr:
+    # Placeholders: replace with your own AWS credentials and never commit real keys.
+    aws_access_key_id: KEY
+    aws_secret_access_key: SECRETE
+    # Instance type and number of core instances for the cluster.
+    core_instance_type: t3.nano
+    num_core_instances: 4
+    # AWS region used for the cluster.
+    region: us-west-2
diff --git a/mrjob/__init__.py b/mrjob/__init__.py
diff --git a/mrjob/count_words.py b/mrjob/count_words.py
@@ -0,0 +1,64 @@
+import re, os, sys, argparse as ap
+from operator import add
+from mrjob.job import MRJob
+
+WORD_RE = re.compile(r"[\w']+")
+
+class MRSparkWordcountAWS(MRJob):
+    """Word-count Spark job that the entry point runs for the ``emr`` runner.
+
+    Counts how often each lower-cased word occurs in the input and writes
+    the ``(word, count)`` pairs to the output path as text.
+    """
+
+    def spark(self, input_path, output_path):
+        """Run the word count with Spark.
+
+        Args:
+            input_path: Path of the text input, passed to ``textFile``.
+            output_path: Path the counts are written to with
+                ``saveAsTextFile``.
+        """
+        # Spark may not be available where script is launched
+        from pyspark import SparkContext
+
+        sc = SparkContext(appName='mrjob Spark wordcount script')
+        try:
+            lines = sc.textFile(input_path)
+
+            counts = (
+                lines.flatMap(self.get_words)
+                .map(lambda word: (word, 1))
+                .reduceByKey(add))
+
+            counts.saveAsTextFile(output_path)
+        finally:
+            # Release the context even when a step above fails.
+            sc.stop()
+
+    def get_words(self, line):
+        """Return every word found in *line*, lower-cased, in order."""
+        return [w.lower() for w in WORD_RE.findall(line)]
+
+class MRSparkWordcount(MRJob):
+    """Word-count Spark job that runs on a local Spark master.
+
+    Counts how often each lower-cased word occurs in the input and writes
+    the ``(word, count)`` pairs to the output path as text. The application
+    name and executor memory come from the ``settings`` module.
+    """
+
+    def spark(self, input_path, output_path):
+        """Run the word count on a local SparkSession.
+
+        Args:
+            input_path: Path of the text input, passed to ``textFile``.
+            output_path: Path the counts are written to with
+                ``saveAsTextFile``.
+        """
+        # Spark may not be available where script is launched
+        from pyspark.sql import SparkSession
+        import settings
+
+        spark = SparkSession.builder \
+            .master("local") \
+            .appName( settings.APP_NAME ) \
+            .config("spark.executor.memory", settings.SPARK_MEM ) \
+            .getOrCreate()
+
+        sc = spark.sparkContext
+        try:
+            lines = sc.textFile(input_path)
+
+            counts = ( lines.flatMap(self.get_words)
+                .map(lambda word: (word, 1))
+                .reduceByKey(add) )
+
+            counts.saveAsTextFile(output_path)
+        finally:
+            # Release the context even when a step above fails.
+            sc.stop()
+
+    def get_words(self, line):
+        """Return every word found in *line*, lower-cased, in order."""
+        return [w.lower() for w in WORD_RE.findall(line)]
+
+if __name__ == '__main__':
+    # `python count_words.py <input> -r emr` puts the runner name in argv[3].
+    # Anything else falls back to the local job; previously a fourth argument
+    # other than 'emr' ran no job at all.
+    if len(sys.argv) >= 4 and str(sys.argv[3]) == 'emr':
+        MRSparkWordcountAWS.run()
+    else:
+        MRSparkWordcount.run()
diff --git a/mrjob/data/input_data.txt b/mrjob/data/input_data.txt
@@ -0,0 +1,12 @@
+King had been preaching about dreams since 1960, when he gave a speech to the National Association for the Advancement of Colored People (NAACP) called "The Negro and the American Dream". This speech discusses the gap between the American dream and reality, saying that overt white supremacists have violated the dream, and that "our federal government has also scarred the dream through its apathy and hypocrisy, its betrayal of the cause of justice". King suggests that "It may well be that the Negro is God's instrument to save the soul of America."[10][11] In 1961, he spoke of the Civil Rights Movement and student activists' "dream" of equality—"the American Dream ... a dream as yet unfulfilled"—in several national speeches and statements, and took "the dream" as the centerpiece for these speeches.[12]
+On November 27, 1962, King gave a speech at Booker T. Washington High School in Rocky Mount, North Carolina. That speech was longer than the version which he would eventually deliver from the Lincoln Memorial. And while parts of the text had been moved around, large portions were identical, including the "I have a dream" refrain.[13][14] After being rediscovered,[15] the restored and digitized recording of the 1962 speech was presented to the public by the English department of North Carolina State University.[13]
+King had also delivered a "dream" speech in Detroit, in June 1963, when he marched on Woodward Avenue with Walter Reuther and the Reverend C. L. Franklin, and had rehearsed other parts.[16] Mahalia Jackson, who sang "How I Got Over",[17] just before the speech in Washington, knew about King's Detroit speech.[18]
+The March on Washington Speech, known as "I Have a Dream Speech", has been shown to have had several versions, written at several different times.[19] It has no single version draft, but is an amalgamation of several drafts, and was originally called "Normalcy, Never Again". Little of this, and another "Normalcy Speech", ended up in the final draft. A draft of "Normalcy, Never Again" is housed in the Morehouse College Martin Luther King Jr. Collection of the Robert W. Woodruff Library, Atlanta University Center and Morehouse College.[20] The focus on "I have a dream" comes through the speech's delivery. Toward the end of its delivery, noted African American gospel singer Mahalia Jackson shouted to King from the crowd, "Tell them about the dream, Martin."[21] King departed from his prepared remarks and started "preaching" improvisationally, punctuating his points with "I have a dream."
+The speech was drafted with the assistance of Stanley Levison and Clarence Benjamin Jones[22] in Riverdale, New York City. Jones has said that "the logistical preparations for the march were so burdensome that the speech was not a priority for us" and that, "on the evening of Tuesday, Aug. 27, [12 hours before the march] Martin still didn't know what he was going to say".[23]
+Leading up to the speech's rendition at the Great March on Washington, King had delivered its "I have a dream" refrains in his speech before 25,000 people in Detroit's Cobo Hall immediately after the 125,000-strong Great Walk to Freedom in Detroit, June 23, 1963.[24][25] After the Washington, D.C. March, a recording of King's Cobo Hall speech was released by Detroit's Gordy Records as an LP entitled "The Great March To Freedom".[26]
+King had been preaching about dreams since 1960, when he gave a speech to the National Association for the Advancement of Colored People (NAACP) called "The Negro and the American Dream". This speech discusses the gap between the American dream and reality, saying that overt white supremacists have violated the dream, and that "our federal government has also scarred the dream through its apathy and hypocrisy, its betrayal of the cause of justice". King suggests that "It may well be that the Negro is God's instrument to save the soul of America."[10][11] In 1961, he spoke of the Civil Rights Movement and student activists' "dream" of equality—"the American Dream ... a dream as yet unfulfilled"—in several national speeches and statements, and took "the dream" as the centerpiece for these speeches.[12]
+On November 27, 1962, King gave a speech at Booker T. Washington High School in Rocky Mount, North Carolina. That speech was longer than the version which he would eventually deliver from the Lincoln Memorial. And while parts of the text had been moved around, large portions were identical, including the "I have a dream" refrain.[13][14] After being rediscovered,[15] the restored and digitized recording of the 1962 speech was presented to the public by the English department of North Carolina State University.[13]
+King had also delivered a "dream" speech in Detroit, in June 1963, when he marched on Woodward Avenue with Walter Reuther and the Reverend C. L. Franklin, and had rehearsed other parts.[16] Mahalia Jackson, who sang "How I Got Over",[17] just before the speech in Washington, knew about King's Detroit speech.[18]
+The March on Washington Speech, known as "I Have a Dream Speech", has been shown to have had several versions, written at several different times.[19] It has no single version draft, but is an amalgamation of several drafts, and was originally called "Normalcy, Never Again". Little of this, and another "Normalcy Speech", ended up in the final draft. A draft of "Normalcy, Never Again" is housed in the Morehouse College Martin Luther King Jr. Collection of the Robert W. Woodruff Library, Atlanta University Center and Morehouse College.[20] The focus on "I have a dream" comes through the speech's delivery. Toward the end of its delivery, noted African American gospel singer Mahalia Jackson shouted to King from the crowd, "Tell them about the dream, Martin."[21] King departed from his prepared remarks and started "preaching" improvisationally, punctuating his points with "I have a dream."
+The speech was drafted with the assistance of Stanley Levison and Clarence Benjamin Jones[22] in Riverdale, New York City. Jones has said that "the logistical preparations for the march were so burdensome that the speech was not a priority for us" and that, "on the evening of Tuesday, Aug. 27, [12 hours before the march] Martin still didn't know what he was going to say".[23]
+Leading up to the speech's rendition at the Great March on Washington, King had delivered its "I have a dream" refrains in his speech before 25,000 people in Detroit's Cobo Hall immediately after the 125,000-strong Great Walk to Freedom in Detroit, June 23, 1963.[24][25] After the Washington, D.C. March, a recording of King's Cobo Hall speech was released by Detroit's Gordy Records as an LP entitled "The Great March To Freedom".[26]
diff --git a/mrjob/data/output b/mrjob/data/output
diff --git a/mrjob/requirements.txt b/mrjob/requirements.txt
@@ -0,0 +1,32 @@
+boto3==1.10.19
+botocore==1.13.19
+cachetools==3.1.1
+certifi==2019.9.11
+chardet==3.0.4
+docutils==0.15.2
+findspark==1.3.0
+google-api-core==1.14.3
+google-auth==1.7.1
+google-cloud-core==1.0.3
+google-cloud-dataproc==0.6.1
+google-cloud-logging==1.14.0
+google-cloud-storage==1.23.0
+google-resumable-media==0.5.0
+googleapis-common-protos==1.6.0
+grpcio==1.25.0
+idna==2.8
+jmespath==0.9.4
+mrjob==0.6.12
+protobuf==3.10.0
+py4j==0.10.7
+pyasn1==0.4.8
+pyasn1-modules==0.2.7
+pyspark==2.4.4
+python-dateutil==2.8.0
+pytz==2019.3
+PyYAML==5.1.2
+requests==2.22.0
+rsa==4.0
+s3transfer==0.2.1
+six==1.13.0
+urllib3==1.25.7
diff --git a/mrjob/settings.py b/mrjob/settings.py
@@ -0,0 +1,5 @@
+import os
+
+# Each setting reads an environment variable and falls back to the default shown.
+# Spark application name.
+APP_NAME=os.getenv('APP_NAME', "Spark Hadoop")
+# Value passed to Spark as spark.executor.memory.
+SPARK_MEM=os.getenv('SPARK_MEM', "1gb")
+# Spark installation directory; the default is built from the current working
+# directory, so it depends on where the script is launched from.
+SPARK_PATH=os.getenv( "SPARK_PATH" , "{}{}".format(os.getcwd(), '/../spark/spark-3.0.0-preview-bin-hadoop2.7/') )
diff --git a/pyspark/__init__.py b/pyspark/__init__.py
diff --git a/pyspark/main.py b/pyspark/main.py
@@ -1,7 +1,49 @@
-import findspark, os, settings
+from pyspark.sql import SparkSession, Row
+from pyspark.sql.types import *
+from pyspark.sql.functions import grouping, col, desc, asc
+from operator import add
+import os
+import settings
 
-# Carregar Spark com Hadoop
-findspark.init( settings.SPARK_PATH )
+def main():
+    """Show the ten names whose frequency grew the most from 2000 to 2010.
+
+    Reads ``data/nomes-censos-ibge.csv`` from the current working directory,
+    builds a DataFrame with the name, the two census columns and their
+    difference, keeps the rows that grew by more than 10000, and prints the
+    top ten ordered by percentage increase.
+    """
+    # Build the SparkSession
+    spark = SparkSession.builder \
+        .master("local") \
+        .appName( settings.APP_NAME ) \
+        .config("spark.executor.memory", settings.SPARK_MEM ) \
+        .getOrCreate()
+
+    try:
+        sc = spark.sparkContext
 
+        # Load the CSV file
+        rdd = sc.textFile( "{}{}".format(os.getcwd(), '/data/nomes-censos-ibge.csv') )
+
+        # Split each line into columns
+        rdd = rdd.map(lambda line: line.split(","))
+
+        # Build the DataFrame; columns 8 and 9 feed Year2000 and Year2010
+        df = rdd.map(
+            lambda line: Row(
+                Name=line[0],
+                Year2000=line[8],
+                Year2010=line[9],
+                Diff=defaultZero(line[9]) - defaultZero(line[8]),
+            )
+        ).toDF()
+        df = convertColumn(df, ["Year2000", "Year2010", "Diff"], FloatType())
+
+        # Keep names that grew by more than 10000 over the ten years
+        df.createOrReplaceTempView("names")
+        df2 = spark.sql("SELECT Name, Year2000, Year2010, Diff, (Diff/Year2000*100) as Increase from names where (Year2010 - Year2000) > 10000 order by (Diff/Year2000*100) DESC")
+
+        # Top 10 names that became more common between 2000 and 2010
+        df2.show(n=10)
+    finally:
+        # Release the session even when a step above fails.
+        spark.stop()
+
+
+def convertColumn(df, columns, Type):
+    """Return *df* with each column named in *columns* cast to *Type*.
+
+    Args:
+        df: DataFrame whose columns are cast.
+        columns: Iterable of column names to cast, processed in order.
+        Type: Target type passed to each column's ``cast``.
+    """
+    converted = df
+    for column_name in columns:
+        casted = converted[column_name].cast(Type)
+        converted = converted.withColumn(column_name, casted)
+    return converted
+
+def defaultZero(value):
+    """Return *value* as an int truncated toward zero, or 0 if it is not numeric.
+
+    Args:
+        value: A number, a numeric string, or ``None``.
+
+    Returns:
+        ``int(float(value))``; 0 when *value* is ``None`` or cannot be
+        converted (non-numeric text, unsupported type, infinity or NaN).
+    """
+    if value is None:
+        return 0
+    try:
+        return int(float(value))
+    except (TypeError, ValueError, OverflowError):
+        # Only conversion failures fall back to 0; a bare `except` would
+        # also swallow KeyboardInterrupt and SystemExit.
+        return 0
+
+# Run the report only when executed as a script, not when imported.
+if __name__ == '__main__':
+    main()
diff --git a/pyspark/requirements.txt b/pyspark/requirements.txt
@@ -1,3 +1,32 @@
+boto3==1.10.19
+botocore==1.13.19
+cachetools==3.1.1
+certifi==2019.9.11
+chardet==3.0.4
+docutils==0.15.2
 findspark==1.3.0
+google-api-core==1.14.3
+google-auth==1.7.1
+google-cloud-core==1.0.3
+google-cloud-dataproc==0.6.1
+google-cloud-logging==1.14.0
+google-cloud-storage==1.23.0
+google-resumable-media==0.5.0
+googleapis-common-protos==1.6.0
+grpcio==1.25.0
+idna==2.8
+jmespath==0.9.4
+mrjob==0.6.12
+protobuf==3.10.0
 py4j==0.10.7
-pyspark==2.3.2
+pyasn1==0.4.8
+pyasn1-modules==0.2.7
+pyspark==2.4.4
+python-dateutil==2.8.0
+pytz==2019.3
+PyYAML==5.1.2
+requests==2.22.0
+rsa==4.0
+s3transfer==0.2.1
+six==1.13.0
+urllib3==1.25.7
diff --git a/pyspark/run.py b/pyspark/run.py
@@ -0,0 +1,8 @@
+# Entry point: point findspark at the Spark installation, then run main.main().
+import findspark, os
+import settings
+
+# Load Spark with Hadoop
+findspark.init( settings.SPARK_PATH )
+
+# main imports pyspark at module level, so it is imported only after
+# findspark.init() has run.
+import main
+main.main()
diff --git a/pyspark/runmr.py b/pyspark/runmr.py
diff --git a/pyspark/settings.py b/pyspark/settings.py
@@ -1,3 +1,5 @@
 import os
 
-SPARK_PATH=os.getenv( "SPARK_PATH" , "{}{}".format(os.getcwd(), '/spark/spark-3.0.0-preview-bin-hadoop2.7/') )
+# Each setting reads an environment variable and falls back to the default shown.
+# Spark application name.
+APP_NAME=os.getenv('APP_NAME', "Spark Hadoop")
+# Value passed to Spark as spark.executor.memory.
+SPARK_MEM=os.getenv('SPARK_MEM', "1gb")
+# Spark installation directory handed to findspark.init(); the default is built
+# from the current working directory, so it depends on where the script is launched from.
+SPARK_PATH=os.getenv( "SPARK_PATH" , "{}{}".format(os.getcwd(), '/../spark/spark-3.0.0-preview-bin-hadoop2.7/') )
diff --git a/pyspark/spark/README.md b/pyspark/spark/README.md