-
Notifications
You must be signed in to change notification settings - Fork 7
/
mapper-ngrams.py
45 lines (39 loc) · 1.33 KB
/
mapper-ngrams.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/usr/bin/env python
# Code imported from
# https://dbaumgartel.wordpress.com/2014/04/10/an-elastic-mapreduce-streaming-example-with-python-and-ngrams-on-aws/
import sys
def CleanWord(aword):
    """
    Normalise a string that is meant to be a single word.

    The word is lower-cased and the punctuation characters
    . , ; : ' ? are removed from it.

    Input: aword, a string interpreted as a single word.
    Output: the cleaned, lower-case word, or None when nothing is left
    after removing punctuation or when any remaining character is
    outside the plain english alphabet a-z.
    """
    punctuation = '.,;:\'?'
    alphabet = 'abcdefghijklmnopqrstuvwxyz'

    # Lower-case first so the alphabet check below only needs a-z
    lowered = aword.lower()

    # Remove special characters from the word in a single pass
    cleaned = ''.join(ch for ch in lowered if ch not in punctuation)

    # No empty words
    if not cleaned:
        return None

    # Restrict the word to the standard english alphabet
    if any(ch not in alphabet for ch in cleaned):
        return None

    return cleaned
# Now we loop over lines in the system input.
# Each line is split on whitespace; the first three fields are read as
# the word, the year and the number of occurrences.
for line in sys.stdin:
    # Strip the line of whitespace and split into a list
    fields = line.strip().split()

    # Blank or truncated lines cannot be indexed safely; skip them
    # instead of crashing the whole mapper with an IndexError
    if len(fields) < 3:
        continue

    # Use CleanWord function to clean up the word
    word = CleanWord(fields[0])

    # If CleanWord didn't return a string, move on
    if word is None:
        continue

    # Get the year and the number of occurrences from the ngram line;
    # a line whose numeric fields are not integers is skipped
    try:
        year = int(fields[1])
        occurrences = int(fields[2])
    except ValueError:
        continue

    # Print the output: word, year, and number of occurrences.
    # The parenthesised single-argument form behaves the same whether
    # print is a statement (Python 2) or a function (Python 3).
    print('%s\t%s\t%s' % (word, year, occurrences))