Skip to content

Commit b5f87cb

Browse files
mrjob AWS e Local
1 parent b632c31 commit b5f87cb

15 files changed

+250
-63
lines changed

README.md

+42-3
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,11 @@ SPARK_PATH=<path_to_projetct>/spark/spark-3.0.0-preview-bin-hadoop2.7/
1515
SPARK_MEM=1gb
1616
APP_NAME=Spark Hadoop Teste
1717

18-
### Run the test
18+
### Run
1919
```
20-
$ python main.py
20+
$ cd pyspark
21+
$ pip install -r requirements.txt
22+
$ python run.py
2123
+--------+--------+--------+-------+------------------+
2224
| Name|Year2000|Year2010| Diff| Increase|
2325
+--------+--------+--------+-------+------------------+
@@ -34,4 +36,41 @@ $ python main.py
3436
+--------+--------+--------+-------+------------------+
3537
```
3638

37-
# Examplo MrJob
39+
# MrJob Example
40+
41+
### Running MrJob Locally
42+
```
43+
$ cd mrjob
44+
$ pip install -r requirements.txt
45+
$ python count_words.py data/input_data.txt > data/output.txt
46+
```
47+
48+
### Running MrJob in AWS EMR
49+
```
50+
$ aws emr create-default-roles
51+
```
52+
Add your AWS credentials to `.mrjob.conf`:
53+
54+
```
55+
$ cd mrjob
56+
$ pip install -r requirements.txt
57+
$ python count_words.py data/input_data.txt -r emr
58+
59+
Using s3://mrjob-bb3fb02bec0467d2/tmp/ as our temp dir on S3
60+
Creating temp directory /tmp/count_words.marbeik.20191118.185622.785260
61+
writing master bootstrap script to /tmp/count_words.marbeik.20191118.185622.785260/b.sh
62+
uploading working dir files to s3://mrjob-bb3fb02bec0467d2/tmp/count_words.marbeik.20191118.185622.785260/files/wd...
63+
Copying other local files to s3://mrjob-bb3fb02bec0467d2/tmp/count_words.marbeik.20191118.185622.785260/files/
64+
Created new cluster j-1ZY5Z37LMZHWF
65+
Added EMR tags to cluster j-1ZY5Z37LMZHWF: __mrjob_label=count_words, __mrjob_owner=marbeik, __mrjob_version=0.6.12
66+
Waiting for Step 1 of 1 (s-1TFAJLTSJSWXK) to complete...
67+
PENDING (cluster is STARTING)
68+
PENDING (cluster is STARTING)
69+
PENDING (cluster is BOOTSTRAPPING: Running bootstrap actions)
70+
PENDING (cluster is BOOTSTRAPPING: Running bootstrap actions)
71+
PENDING (cluster is BOOTSTRAPPING: Running bootstrap actions)
72+
PENDING (cluster is RUNNING: Running step)
73+
master node is ec2-52-42-249-126.us-west-2.compute.amazonaws.com
74+
RUNNING for 0:00:33 ...
75+
76+
```

mrjob/.mrjob.conf

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
runners:
2+
emr:
3+
aws_access_key_id: KEY
4+
aws_secret_access_key: SECRET
5+
core_instance_type: t3.nano
6+
num_core_instances: 4
7+
region: us-west-2

mrjob/__init__.py

Whitespace-only changes.

mrjob/count_words.py

+64
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
import re, os, sys, argparse as ap
2+
from operator import add
3+
from mrjob.job import MRJob
4+
5+
# Matches one "word": a run of word characters (\w) and/or apostrophes.
WORD_RE = re.compile(r"[\w']+")
6+
7+
class MRSparkWordcountAWS(MRJob):
    """Word-count job implemented through mrjob's ``spark()`` hook.

    Creates its own ``SparkContext`` without setting a master, so the
    runner that launches the job decides where Spark executes.
    """

    def spark(self, input_path, output_path):
        """Count the words in ``input_path`` and save the counts.

        Args:
            input_path: Location passed to ``SparkContext.textFile``.
            output_path: Location passed to ``RDD.saveAsTextFile``.
        """
        # Spark may not be available where script is launched
        from pyspark import SparkContext

        sc = SparkContext(appName='mrjob Spark wordcount script')
        try:
            lines = sc.textFile(input_path)

            # (word, 1) pairs summed per word.
            counts = (
                lines.flatMap(self.get_words)
                .map(lambda word: (word, 1))
                .reduceByKey(add))

            counts.saveAsTextFile(output_path)
        finally:
            # Always release the context, even when a Spark step fails.
            sc.stop()

    def get_words(self, line):
        """Return the lower-cased words that ``WORD_RE`` finds in ``line``."""
        return [w.lower() for w in WORD_RE.findall(line)]
28+
29+
class MRSparkWordcount(MRJob):
    """Word-count job that runs Spark with a ``local`` master.

    The application name and executor memory are read from the
    project's ``settings`` module.
    """

    def spark(self, input_path, output_path):
        """Count the words in ``input_path`` and save the counts.

        Args:
            input_path: Location passed to ``SparkContext.textFile``.
            output_path: Location passed to ``RDD.saveAsTextFile``.
        """
        # Spark may not be available where script is launched
        from pyspark.sql import SparkSession
        import settings

        spark = (
            SparkSession.builder
            .master("local")
            .appName(settings.APP_NAME)
            .config("spark.executor.memory", settings.SPARK_MEM)
            .getOrCreate()
        )

        sc = spark.sparkContext
        try:
            lines = sc.textFile(input_path)

            # (word, 1) pairs summed per word.
            counts = (
                lines.flatMap(self.get_words)
                .map(lambda word: (word, 1))
                .reduceByKey(add))

            counts.saveAsTextFile(output_path)
        finally:
            # Always release the context, even when a Spark step fails.
            sc.stop()

    def get_words(self, line):
        """Return the lower-cased words that ``WORD_RE`` finds in ``line``."""
        return [w.lower() for w in WORD_RE.findall(line)]
58+
59+
def is_emr_runner(argv):
    """Return True when ``argv`` selects the ``emr`` runner.

    Recognises ``-r emr``, ``--runner emr``, ``-remr`` and ``--runner=emr``
    at any position, instead of assuming the runner name is ``argv[3]``.

    Args:
        argv: Full argument list, e.g. ``sys.argv``.
    """
    if '--runner=emr' in argv or '-remr' in argv:
        return True
    # Flag followed by its value as the next argument.
    return any(
        flag in ('-r', '--runner') and value == 'emr'
        for flag, value in zip(argv, argv[1:])
    )


if __name__ == '__main__':
    # Exactly one job always runs: the EMR variant when the emr runner is
    # requested, the local-master variant for every other invocation.
    if is_emr_runner(sys.argv):
        MRSparkWordcountAWS.run()
    else:
        MRSparkWordcount.run()

mrjob/data/input_data.txt

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
King had been preaching about dreams since 1960, when he gave a speech to the National Association for the Advancement of Colored People (NAACP) called "The Negro and the American Dream". This speech discusses the gap between the American dream and reality, saying that overt white supremacists have violated the dream, and that "our federal government has also scarred the dream through its apathy and hypocrisy, its betrayal of the cause of justice". King suggests that "It may well be that the Negro is God's instrument to save the soul of America."[10][11] In 1961, he spoke of the Civil Rights Movement and student activists' "dream" of equality—"the American Dream ... a dream as yet unfulfilled"—in several national speeches and statements, and took "the dream" as the centerpiece for these speeches.[12]
2+
On November 27, 1962, King gave a speech at Booker T. Washington High School in Rocky Mount, North Carolina. That speech was longer than the version which he would eventually deliver from the Lincoln Memorial. And while parts of the text had been moved around, large portions were identical, including the "I have a dream" refrain.[13][14] After being rediscovered,[15] the restored and digitized recording of the 1962 speech was presented to the public by the English department of North Carolina State University.[13]
3+
King had also delivered a "dream" speech in Detroit, in June 1963, when he marched on Woodward Avenue with Walter Reuther and the Reverend C. L. Franklin, and had rehearsed other parts.[16] Mahalia Jackson, who sang "How I Got Over",[17] just before the speech in Washington, knew about King's Detroit speech.[18]
4+
The March on Washington Speech, known as "I Have a Dream Speech", has been shown to have had several versions, written at several different times.[19] It has no single version draft, but is an amalgamation of several drafts, and was originally called "Normalcy, Never Again". Little of this, and another "Normalcy Speech", ended up in the final draft. A draft of "Normalcy, Never Again" is housed in the Morehouse College Martin Luther King Jr. Collection of the Robert W. Woodruff Library, Atlanta University Center and Morehouse College.[20] The focus on "I have a dream" comes through the speech's delivery. Toward the end of its delivery, noted African American gospel singer Mahalia Jackson shouted to King from the crowd, "Tell them about the dream, Martin."[21] King departed from his prepared remarks and started "preaching" improvisationally, punctuating his points with "I have a dream."
5+
The speech was drafted with the assistance of Stanley Levison and Clarence Benjamin Jones[22] in Riverdale, New York City. Jones has said that "the logistical preparations for the march were so burdensome that the speech was not a priority for us" and that, "on the evening of Tuesday, Aug. 27, [12 hours before the march] Martin still didn't know what he was going to say".[23]
6+
Leading up to the speech's rendition at the Great March on Washington, King had delivered its "I have a dream" refrains in his speech before 25,000 people in Detroit's Cobo Hall immediately after the 125,000-strong Great Walk to Freedom in Detroit, June 23, 1963.[24][25] After the Washington, D.C. March, a recording of King's Cobo Hall speech was released by Detroit's Gordy Records as an LP entitled "The Great March To Freedom".[26]
7+
King had been preaching about dreams since 1960, when he gave a speech to the National Association for the Advancement of Colored People (NAACP) called "The Negro and the American Dream". This speech discusses the gap between the American dream and reality, saying that overt white supremacists have violated the dream, and that "our federal government has also scarred the dream through its apathy and hypocrisy, its betrayal of the cause of justice". King suggests that "It may well be that the Negro is God's instrument to save the soul of America."[10][11] In 1961, he spoke of the Civil Rights Movement and student activists' "dream" of equality—"the American Dream ... a dream as yet unfulfilled"—in several national speeches and statements, and took "the dream" as the centerpiece for these speeches.[12]
8+
On November 27, 1962, King gave a speech at Booker T. Washington High School in Rocky Mount, North Carolina. That speech was longer than the version which he would eventually deliver from the Lincoln Memorial. And while parts of the text had been moved around, large portions were identical, including the "I have a dream" refrain.[13][14] After being rediscovered,[15] the restored and digitized recording of the 1962 speech was presented to the public by the English department of North Carolina State University.[13]
9+
King had also delivered a "dream" speech in Detroit, in June 1963, when he marched on Woodward Avenue with Walter Reuther and the Reverend C. L. Franklin, and had rehearsed other parts.[16] Mahalia Jackson, who sang "How I Got Over",[17] just before the speech in Washington, knew about King's Detroit speech.[18]
10+
The March on Washington Speech, known as "I Have a Dream Speech", has been shown to have had several versions, written at several different times.[19] It has no single version draft, but is an amalgamation of several drafts, and was originally called "Normalcy, Never Again". Little of this, and another "Normalcy Speech", ended up in the final draft. A draft of "Normalcy, Never Again" is housed in the Morehouse College Martin Luther King Jr. Collection of the Robert W. Woodruff Library, Atlanta University Center and Morehouse College.[20] The focus on "I have a dream" comes through the speech's delivery. Toward the end of its delivery, noted African American gospel singer Mahalia Jackson shouted to King from the crowd, "Tell them about the dream, Martin."[21] King departed from his prepared remarks and started "preaching" improvisationally, punctuating his points with "I have a dream."
11+
The speech was drafted with the assistance of Stanley Levison and Clarence Benjamin Jones[22] in Riverdale, New York City. Jones has said that "the logistical preparations for the march were so burdensome that the speech was not a priority for us" and that, "on the evening of Tuesday, Aug. 27, [12 hours before the march] Martin still didn't know what he was going to say".[23]
12+
Leading up to the speech's rendition at the Great March on Washington, King had delivered its "I have a dream" refrains in his speech before 25,000 people in Detroit's Cobo Hall immediately after the 125,000-strong Great Walk to Freedom in Detroit, June 23, 1963.[24][25] After the Washington, D.C. March, a recording of King's Cobo Hall speech was released by Detroit's Gordy Records as an LP entitled "The Great March To Freedom".[26]

mrjob/data/output

Whitespace-only changes.

mrjob/requirements.txt

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
boto3==1.10.19
2+
botocore==1.13.19
3+
cachetools==3.1.1
4+
certifi==2019.9.11
5+
chardet==3.0.4
6+
docutils==0.15.2
7+
findspark==1.3.0
8+
google-api-core==1.14.3
9+
google-auth==1.7.1
10+
google-cloud-core==1.0.3
11+
google-cloud-dataproc==0.6.1
12+
google-cloud-logging==1.14.0
13+
google-cloud-storage==1.23.0
14+
google-resumable-media==0.5.0
15+
googleapis-common-protos==1.6.0
16+
grpcio==1.25.0
17+
idna==2.8
18+
jmespath==0.9.4
19+
mrjob==0.6.12
20+
protobuf==3.10.0
21+
py4j==0.10.7
22+
pyasn1==0.4.8
23+
pyasn1-modules==0.2.7
24+
pyspark==2.4.4
25+
python-dateutil==2.8.0
26+
pytz==2019.3
27+
PyYAML==5.1.2
28+
requests==2.22.0
29+
rsa==4.0
30+
s3transfer==0.2.1
31+
six==1.13.0
32+
urllib3==1.25.7

mrjob/settings.py

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
import os
2+
3+
# Each setting can be overridden by the environment variable of the same name.

# Spark application name.
APP_NAME = os.environ.get('APP_NAME', "Spark Hadoop")

# Value for Spark's "spark.executor.memory" option.
SPARK_MEM = os.environ.get('SPARK_MEM', "1gb")

# Presumably the Spark installation directory (TODO confirm); the default is
# resolved against the current working directory, not this file's location.
SPARK_PATH = os.environ.get(
    "SPARK_PATH",
    os.getcwd() + '/../spark/spark-3.0.0-preview-bin-hadoop2.7/',
)

pyspark/__init__.py

Whitespace-only changes.

pyspark/main.py

+47-5
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,49 @@
1-
import findspark, os, settings
1+
from pyspark.sql import SparkSession, Row
2+
from pyspark.sql.types import *
3+
from pyspark.sql.functions import grouping, col, desc, asc
4+
from operator import add
5+
import os
6+
import settings
27

3-
# Carregar Spark com Hadoop
4-
findspark.init( settings.SPARK_PATH )
8+
def main():
    """Show the ten names with the largest percentage growth from 2000 to 2010.

    Reads ``data/nomes-censos-ibge.csv`` relative to the current working
    directory, keeps the names whose count grew by more than 10000 between
    the two years, and prints the top ten ordered by percentage increase.
    """
    # Build the SparkSession
    spark = (
        SparkSession.builder
        .master("local")
        .appName(settings.APP_NAME)
        .config("spark.executor.memory", settings.SPARK_MEM)
        .getOrCreate()
    )

    try:
        sc = spark.sparkContext

        # Load the CSV file
        rdd = sc.textFile("{}{}".format(os.getcwd(), '/data/nomes-censos-ibge.csv'))

        # Split every line into its columns
        rdd = rdd.map(lambda line: line.split(","))

        # Build the DataFrame: column 0 is used as the name, columns 8 and 9
        # as the 2000 and 2010 values. Diff is computed with defaultZero so
        # that unparsable cells count as 0.
        df = rdd.map(lambda line: Row(
            Name=line[0],
            Year2000=line[8],
            Year2010=line[9],
            Diff=defaultZero(line[9]) - defaultZero(line[8]),
        )).toDF()
        df = convertColumn(df, ["Year2000", "Year2010", "Diff"], FloatType())

        # Keep only names that grew by more than 10000 over the ten years
        df.createOrReplaceTempView("names")
        df2 = spark.sql("SELECT Name, Year2000, Year2010, Diff, (Diff/Year2000*100) as Increase from names where (Year2010 - Year2000) > 10000 order by (Diff/Year2000*100) DESC")

        # Top 10 names that became more common between 2000 and 2010
        df2.show(n=10)
    finally:
        # Release the session even when a step above fails.
        spark.stop()
34+
35+
36+
def convertColumn(df, columns, Type):
    """Return ``df`` with each column listed in ``columns`` cast to ``Type``.

    Args:
        df: Object exposing ``withColumn`` and item access by column name.
        columns: Iterable of column names to cast.
        Type: Target type passed to each column's ``cast``.
    """
    converted = df
    for column_name in columns:
        casted = converted[column_name].cast(Type)
        converted = converted.withColumn(column_name, casted)
    return converted
40+
41+
def defaultZero(value):
    """Return ``value`` parsed as a number and truncated to ``int``, or 0.

    Args:
        value: Anything ``float()`` accepts (number or numeric string),
            or ``None``.

    Returns:
        ``int(float(value))``; 0 when ``value`` is ``None`` or cannot be
        converted.
    """
    if value is None:
        return 0
    try:
        return int(float(value))
    except (TypeError, ValueError, OverflowError):
        # Non-numeric text, unsupported type, NaN or infinity. Only these
        # are caught so KeyboardInterrupt/SystemExit still propagate.
        return 0
46+
47+
# Run the report only when this file is executed as a script.
if __name__ == '__main__':
    main()

pyspark/requirements.txt

+30-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,32 @@
1+
boto3==1.10.19
2+
botocore==1.13.19
3+
cachetools==3.1.1
4+
certifi==2019.9.11
5+
chardet==3.0.4
6+
docutils==0.15.2
17
findspark==1.3.0
8+
google-api-core==1.14.3
9+
google-auth==1.7.1
10+
google-cloud-core==1.0.3
11+
google-cloud-dataproc==0.6.1
12+
google-cloud-logging==1.14.0
13+
google-cloud-storage==1.23.0
14+
google-resumable-media==0.5.0
15+
googleapis-common-protos==1.6.0
16+
grpcio==1.25.0
17+
idna==2.8
18+
jmespath==0.9.4
19+
mrjob==0.6.12
20+
protobuf==3.10.0
221
py4j==0.10.7
3-
pyspark==2.3.2
22+
pyasn1==0.4.8
23+
pyasn1-modules==0.2.7
24+
pyspark==2.4.4
25+
python-dateutil==2.8.0
26+
pytz==2019.3
27+
PyYAML==5.1.2
28+
requests==2.22.0
29+
rsa==4.0
30+
s3transfer==0.2.1
31+
six==1.13.0
32+
urllib3==1.25.7

pyspark/run.py

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import findspark, os
2+
import settings
3+
4+
# Load Spark with Hadoop (must run before importing main, which imports pyspark)
5+
findspark.init( settings.SPARK_PATH )
6+
7+
import main
8+
main.main()

pyspark/runmr.py

-48
This file was deleted.

pyspark/settings.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
11
import os
22

3-
SPARK_PATH=os.getenv( "SPARK_PATH" , "{}{}".format(os.getcwd(), '/spark/spark-3.0.0-preview-bin-hadoop2.7/') )
3+
# Each setting can be overridden by the environment variable of the same name.

# Spark application name.
APP_NAME = os.environ.get('APP_NAME', "Spark Hadoop")

# Value for Spark's "spark.executor.memory" option.
SPARK_MEM = os.environ.get('SPARK_MEM', "1gb")

# Spark directory handed to findspark; the default is resolved against the
# current working directory, not this file's location.
SPARK_PATH = os.environ.get(
    "SPARK_PATH",
    os.getcwd() + '/../spark/spark-3.0.0-preview-bin-hadoop2.7/',
)

pyspark/spark/README.md

-5
This file was deleted.

0 commit comments

Comments
 (0)