Skip to content

Commit 5866b71

Browse files
committed
chapter 2 topic 2.1.1 completed work with corpus
1 parent 3338c47 commit 5866b71

File tree

5 files changed

+308
-0
lines changed

5 files changed

+308
-0
lines changed

README.md

+9
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,15 @@ Let's practice the **Natural language with python**.
2929
* Nested Code Block
3030
* Looping with Conditions
3131

32+
## Chapter 2 :-
33+
### Accessing Text Corpora and Lexical Resources.
34+
35+
#### 2.1 [Accessing Text Corpora](chapter_2/2.1/NLP_chapter_2.ipynb "Language Processing and Python 2.1")
36+
* Gutenberg
37+
* searching text
38+
* counting vocabulary
39+
* lexical diversity
40+
* percentage of occurrence
3241

3342

3443

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"cells": [],
3+
"metadata": {},
4+
"nbformat": 4,
5+
"nbformat_minor": 2
6+
}

chapter_1/1.5/Untitled.ipynb

+83
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [
8+
{
9+
"name": "stdout",
10+
"output_type": "stream",
11+
"text": [
12+
"*** Introductory Examples for the NLTK Book ***\n",
13+
"Loading text1, ..., text9 and sent1, ..., sent9\n",
14+
"Type the name of the text or sentence to view it.\n",
15+
"Type: 'texts()' or 'sents()' to list the materials.\n",
16+
"text1: Moby Dick by Herman Melville 1851\n",
17+
"text2: Sense and Sensibility by Jane Austen 1811\n",
18+
"text3: The Book of Genesis\n",
19+
"text4: Inaugural Address Corpus\n",
20+
"text5: Chat Corpus\n",
21+
"text6: Monty Python and the Holy Grail\n",
22+
"text7: Wall Street Journal\n",
23+
"text8: Personals Corpus\n",
24+
"text9: The Man Who Was Thursday by G . K . Chesterton 1908\n"
25+
]
26+
}
27+
],
28+
"source": [
29+
"import nltk\n",
30+
"from nltk.book import *"
31+
]
32+
},
33+
{
34+
"cell_type": "code",
35+
"execution_count": 2,
36+
"metadata": {},
37+
"outputs": [
38+
{
39+
"ename": "NameError",
40+
"evalue": "name 'babelize_shell' is not defined",
41+
"output_type": "error",
42+
"traceback": [
43+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
44+
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
45+
"\u001b[0;32m<ipython-input-2-7420127cd0cd>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mbabelize_shell\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
46+
"\u001b[0;31mNameError\u001b[0m: name 'babelize_shell' is not defined"
47+
]
48+
}
49+
],
50+
"source": [
51+
"babelize_shell()"
52+
]
53+
},
54+
{
55+
"cell_type": "code",
56+
"execution_count": null,
57+
"metadata": {},
58+
"outputs": [],
59+
"source": []
60+
}
61+
],
62+
"metadata": {
63+
"kernelspec": {
64+
"display_name": "Python 3",
65+
"language": "python",
66+
"name": "python3"
67+
},
68+
"language_info": {
69+
"codemirror_mode": {
70+
"name": "ipython",
71+
"version": 3
72+
},
73+
"file_extension": ".py",
74+
"mimetype": "text/x-python",
75+
"name": "python",
76+
"nbconvert_exporter": "python",
77+
"pygments_lexer": "ipython3",
78+
"version": "3.7.3"
79+
}
80+
},
81+
"nbformat": 4,
82+
"nbformat_minor": 2
83+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{
2+
"cells": [],
3+
"metadata": {},
4+
"nbformat": 4,
5+
"nbformat_minor": 2
6+
}

chapter_2/2.1/NLP_chapter_2.ipynb

+204
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Accessing Text Corpora"
8+
]
9+
},
10+
{
11+
"cell_type": "code",
12+
"execution_count": 1,
13+
"metadata": {},
14+
"outputs": [
15+
{
16+
"name": "stdout",
17+
"output_type": "stream",
18+
"text": [
19+
"*** Introductory Examples for the NLTK Book ***\n",
20+
"Loading text1, ..., text9 and sent1, ..., sent9\n",
21+
"Type the name of the text or sentence to view it.\n",
22+
"Type: 'texts()' or 'sents()' to list the materials.\n",
23+
"text1: Moby Dick by Herman Melville 1851\n",
24+
"text2: Sense and Sensibility by Jane Austen 1811\n",
25+
"text3: The Book of Genesis\n",
26+
"text4: Inaugural Address Corpus\n",
27+
"text5: Chat Corpus\n",
28+
"text6: Monty Python and the Holy Grail\n",
29+
"text7: Wall Street Journal\n",
30+
"text8: Personals Corpus\n",
31+
"text9: The Man Who Was Thursday by G . K . Chesterton 1908\n"
32+
]
33+
}
34+
],
35+
"source": [
36+
"import nltk\n",
37+
"from nltk.book import *"
38+
]
39+
},
40+
{
41+
"cell_type": "markdown",
42+
"metadata": {},
43+
"source": [
44+
"## Gutenberg Corpus"
45+
]
46+
},
47+
{
48+
"cell_type": "markdown",
49+
"metadata": {},
50+
"source": [
51+
"Project Gutenberg contains some 25,000 free e-books; NLTK includes a small selection of these texts."
52+
]
53+
},
54+
{
55+
"cell_type": "code",
56+
"execution_count": 5,
57+
"metadata": {},
58+
"outputs": [
59+
{
60+
"name": "stdout",
61+
"output_type": "stream",
62+
"text": [
63+
"['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']\n"
64+
]
65+
}
66+
],
67+
"source": [
68+
"print(nltk.corpus.gutenberg.fileids())"
69+
]
70+
},
71+
{
72+
"cell_type": "code",
73+
"execution_count": 8,
74+
"metadata": {},
75+
"outputs": [
76+
{
77+
"data": {
78+
"text/plain": [
79+
"192427"
80+
]
81+
},
82+
"execution_count": 8,
83+
"metadata": {},
84+
"output_type": "execute_result"
85+
}
86+
],
87+
"source": [
88+
"# let's choose one of them\n",
89+
"emma = nltk.corpus.gutenberg.words('austen-emma.txt')\n",
90+
"# print the number of word tokens in austen-emma.txt\n",
91+
"len(emma)"
92+
]
93+
},
94+
{
95+
"cell_type": "code",
96+
"execution_count": 10,
97+
"metadata": {},
98+
"outputs": [
99+
{
100+
"name": "stdout",
101+
"output_type": "stream",
102+
"text": [
103+
"4 24 26 austen-emma.txt \n",
104+
"4 26 16 austen-persuasion.txt \n",
105+
"4 28 22 austen-sense.txt \n",
106+
"4 33 79 bible-kjv.txt \n",
107+
"4 19 5 blake-poems.txt \n",
108+
"4 19 14 bryant-stories.txt \n",
109+
"4 17 12 burgess-busterbrown.txt \n",
110+
"4 20 12 carroll-alice.txt \n",
111+
"4 20 11 chesterton-ball.txt \n",
112+
"4 22 11 chesterton-brown.txt \n",
113+
"4 18 10 chesterton-thursday.txt \n",
114+
"4 20 24 edgeworth-parents.txt \n",
115+
"4 25 15 melville-moby_dick.txt \n",
116+
"4 52 10 milton-paradise.txt \n",
117+
"4 11 8 shakespeare-caesar.txt \n",
118+
"4 12 7 shakespeare-hamlet.txt \n",
119+
"4 12 6 shakespeare-macbeth.txt \n",
120+
"4 36 12 whitman-leaves.txt \n"
121+
]
122+
}
123+
],
124+
"source": [
125+
"# print, for every text: average number of characters per word, average sentence length in words, and average number of occurrences of each vocabulary item\n",
126+
"for fileid in gutenberg.fileids():\n",
127+
" num_char = len(gutenberg.raw(fileid))\n",
128+
" num_words = len(gutenberg.words(fileid))\n",
129+
" num_sentance = len(gutenberg.sents(fileid))\n",
130+
" num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))\n",
131+
" print('{} {} {} {} '.format(int(num_char/num_words),int(num_words/num_sentance),int(num_words/num_vocab),fileid))"
132+
]
133+
},
134+
{
135+
"cell_type": "code",
136+
"execution_count": 11,
137+
"metadata": {},
138+
"outputs": [
139+
{
140+
"name": "stdout",
141+
"output_type": "stream",
142+
"text": [
143+
"[['[', 'The', 'Tragedie', 'of', 'Macbeth', 'by', 'William', 'Shakespeare', '1603', ']'], ['Actus', 'Primus', '.'], ['Scoena', 'Prima', '.'], ['Thunder', 'and', 'Lightning', '.'], ['Enter', 'three', 'Witches', '.'], ['1', '.'], ['When', 'shall', 'we', 'three', 'meet', 'againe', '?'], ['In', 'Thunder', ',', 'Lightning', ',', 'or', 'in', 'Raine', '?'], ['2', '.'], ['When', 'the', 'Hurley', '-', 'burley', \"'\", 's', 'done', ',', 'When', 'the', 'Battaile', \"'\", 's', 'lost', ',', 'and', 'wonne']]\n"
144+
]
145+
}
146+
],
147+
"source": [
148+
"# print the first 10 sentences from shakespeare-macbeth.txt\n",
149+
"macbeth_sentance = gutenberg.sents('shakespeare-macbeth.txt')\n",
150+
"print(macbeth_sentance[:10])"
151+
]
152+
},
153+
{
154+
"cell_type": "code",
155+
"execution_count": 16,
156+
"metadata": {},
157+
"outputs": [
158+
{
159+
"name": "stdout",
160+
"output_type": "stream",
161+
"text": [
162+
"Longest length :: 158\n",
163+
"\n",
164+
"[['Doubtfull', 'it', 'stood', ',', 'As', 'two', 'spent', 'Swimmers', ',', 'that', 'doe', 'cling', 'together', ',', 'And', 'choake', 'their', 'Art', ':', 'The', 'mercilesse', 'Macdonwald', '(', 'Worthie', 'to', 'be', 'a', 'Rebell', ',', 'for', 'to', 'that', 'The', 'multiplying', 'Villanies', 'of', 'Nature', 'Doe', 'swarme', 'vpon', 'him', ')', 'from', 'the', 'Westerne', 'Isles', 'Of', 'Kernes', 'and', 'Gallowgrosses', 'is', 'supply', \"'\", 'd', ',', 'And', 'Fortune', 'on', 'his', 'damned', 'Quarry', 'smiling', ',', 'Shew', \"'\", 'd', 'like', 'a', 'Rebells', 'Whore', ':', 'but', 'all', \"'\", 's', 'too', 'weake', ':', 'For', 'braue', 'Macbeth', '(', 'well', 'hee', 'deserues', 'that', 'Name', ')', 'Disdayning', 'Fortune', ',', 'with', 'his', 'brandisht', 'Steele', ',', 'Which', 'smoak', \"'\", 'd', 'with', 'bloody', 'execution', '(', 'Like', 'Valours', 'Minion', ')', 'caru', \"'\", 'd', 'out', 'his', 'passage', ',', 'Till', 'hee', 'fac', \"'\", 'd', 'the', 'Slaue', ':', 'Which', 'neu', \"'\", 'r', 'shooke', 'hands', ',', 'nor', 'bad', 'farwell', 'to', 'him', ',', 'Till', 'he', 'vnseam', \"'\", 'd', 'him', 'from', 'the', 'Naue', 'toth', \"'\", 'Chops', ',', 'And', 'fix', \"'\", 'd', 'his', 'Head', 'vpon', 'our', 'Battlements']]\n"
165+
]
166+
}
167+
],
168+
"source": [
169+
"# print the length of the longest sentence in shakespeare-macbeth.txt and the corresponding sentence(s)\n",
170+
"longest_len = max([len(s) for s in macbeth_sentance])\n",
171+
"print('Longest length :: {}\\n'.format(longest_len))\n",
172+
"print([s for s in macbeth_sentance if len(s)==longest_len])"
173+
]
174+
},
175+
{
176+
"cell_type": "code",
177+
"execution_count": null,
178+
"metadata": {},
179+
"outputs": [],
180+
"source": []
181+
}
182+
],
183+
"metadata": {
184+
"kernelspec": {
185+
"display_name": "Python 3",
186+
"language": "python",
187+
"name": "python3"
188+
},
189+
"language_info": {
190+
"codemirror_mode": {
191+
"name": "ipython",
192+
"version": 3
193+
},
194+
"file_extension": ".py",
195+
"mimetype": "text/x-python",
196+
"name": "python",
197+
"nbconvert_exporter": "python",
198+
"pygments_lexer": "ipython3",
199+
"version": "3.7.3"
200+
}
201+
},
202+
"nbformat": 4,
203+
"nbformat_minor": 2
204+
}

0 commit comments

Comments
 (0)