chapter 2 topic 2.1.1 completed work with corpus

divaymohan · divaymohan · commit 5866b710b41d · 2019-10-09T17:32:17.000+05:30
diff --git a/README.md b/README.md
@@ -29,6 +29,15 @@ Let's practice the **Natural language with python**.
  * Nested Code Block
  * Looping with Conditions
  
+## Chapter 2 :-
+### Accessing Text Corpora and Lexical Resources.
+
+#### 2.1 [Accessing Text Corpora](chapter_2/2.1/NLP_chapter_2.ipynb "Language Processing and Python 2.1")
+* Gutenberg
+* searching text
+* counting vocabulary
+* lexical diversity
+* percentage of occurrence
 
 
 
diff --git a/chapter_1/1.5/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/chapter_1/1.5/.ipynb_checkpoints/Untitled-checkpoint.ipynb
@@ -0,0 +1,6 @@
+{
+ "cells": [],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/chapter_1/1.5/Untitled.ipynb b/chapter_1/1.5/Untitled.ipynb
@@ -0,0 +1,83 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "*** Introductory Examples for the NLTK Book ***\n",
+      "Loading text1, ..., text9 and sent1, ..., sent9\n",
+      "Type the name of the text or sentence to view it.\n",
+      "Type: 'texts()' or 'sents()' to list the materials.\n",
+      "text1: Moby Dick by Herman Melville 1851\n",
+      "text2: Sense and Sensibility by Jane Austen 1811\n",
+      "text3: The Book of Genesis\n",
+      "text4: Inaugural Address Corpus\n",
+      "text5: Chat Corpus\n",
+      "text6: Monty Python and the Holy Grail\n",
+      "text7: Wall Street Journal\n",
+      "text8: Personals Corpus\n",
+      "text9: The Man Who Was Thursday by G . K . Chesterton 1908\n"
+     ]
+    }
+   ],
+   "source": [
+    "import nltk\n",
+    "from nltk.book import *"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'babelize_shell' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-2-7420127cd0cd>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mbabelize_shell\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m: name 'babelize_shell' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "babelize_shell()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/chapter_2/2.1/.ipynb_checkpoints/NLP_chapter_2-checkpoint.ipynb b/chapter_2/2.1/.ipynb_checkpoints/NLP_chapter_2-checkpoint.ipynb
@@ -0,0 +1,6 @@
+{
+ "cells": [],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/chapter_2/2.1/NLP_chapter_2.ipynb b/chapter_2/2.1/NLP_chapter_2.ipynb
@@ -0,0 +1,204 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Accessing Text Corpora"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "*** Introductory Examples for the NLTK Book ***\n",
+      "Loading text1, ..., text9 and sent1, ..., sent9\n",
+      "Type the name of the text or sentence to view it.\n",
+      "Type: 'texts()' or 'sents()' to list the materials.\n",
+      "text1: Moby Dick by Herman Melville 1851\n",
+      "text2: Sense and Sensibility by Jane Austen 1811\n",
+      "text3: The Book of Genesis\n",
+      "text4: Inaugural Address Corpus\n",
+      "text5: Chat Corpus\n",
+      "text6: Monty Python and the Holy Grail\n",
+      "text7: Wall Street Journal\n",
+      "text8: Personals Corpus\n",
+      "text9: The Man Who Was Thursday by G . K . Chesterton 1908\n"
+     ]
+    }
+   ],
+   "source": [
+    "import nltk\n",
+    "from nltk.book import *"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Gutenberg Corpus"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
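+    "NLTK's Gutenberg corpus is a small selection of texts from the Project Gutenberg archive, which contains some 25,000 free e-books."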
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(nltk.corpus.gutenberg.fileids())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "192427"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# let's choose one of them\n",
+    "emma = nltk.corpus.gutenberg.words('austen-emma.txt')\n",
+    "# show the number of tokens (words and punctuation) in austen-emma.txt\n",
+    "len(emma)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "4 24 26 austen-emma.txt \n",
+      "4 26 16 austen-persuasion.txt \n",
+      "4 28 22 austen-sense.txt \n",
+      "4 33 79 bible-kjv.txt \n",
+      "4 19 5 blake-poems.txt \n",
+      "4 19 14 bryant-stories.txt \n",
+      "4 17 12 burgess-busterbrown.txt \n",
+      "4 20 12 carroll-alice.txt \n",
+      "4 20 11 chesterton-ball.txt \n",
+      "4 22 11 chesterton-brown.txt \n",
+      "4 18 10 chesterton-thursday.txt \n",
+      "4 20 24 edgeworth-parents.txt \n",
+      "4 25 15 melville-moby_dick.txt \n",
+      "4 52 10 milton-paradise.txt \n",
+      "4 11 8 shakespeare-caesar.txt \n",
+      "4 12 7 shakespeare-hamlet.txt \n",
+      "4 12 6 shakespeare-macbeth.txt \n",
+      "4 36 12 whitman-leaves.txt \n"
+     ]
+    }
+   ],
+   "source": [
+    "# for each text, print three truncated averages: characters per word (from the raw text length, so whitespace is counted), words per sentence, and occurrences per distinct lowercased word\n",
+    "for fileid in gutenberg.fileids():\n",
+    "    num_char = len(gutenberg.raw(fileid))\n",
+    "    num_words = len(gutenberg.words(fileid))\n",
+    "    num_sentance = len(gutenberg.sents(fileid))\n",
+    "    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))\n",
+    "    print('{} {} {} {} '.format(int(num_char/num_words),int(num_words/num_sentance),int(num_words/num_vocab),fileid))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[['[', 'The', 'Tragedie', 'of', 'Macbeth', 'by', 'William', 'Shakespeare', '1603', ']'], ['Actus', 'Primus', '.'], ['Scoena', 'Prima', '.'], ['Thunder', 'and', 'Lightning', '.'], ['Enter', 'three', 'Witches', '.'], ['1', '.'], ['When', 'shall', 'we', 'three', 'meet', 'againe', '?'], ['In', 'Thunder', ',', 'Lightning', ',', 'or', 'in', 'Raine', '?'], ['2', '.'], ['When', 'the', 'Hurley', '-', 'burley', \"'\", 's', 'done', ',', 'When', 'the', 'Battaile', \"'\", 's', 'lost', ',', 'and', 'wonne']]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# print the first 10 sentences from shakespeare-macbeth.txt\n",
+    "macbeth_sentance = gutenberg.sents('shakespeare-macbeth.txt')\n",
+    "print(macbeth_sentance[:10])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Longest length :: 158\n",
+      "\n",
+      "[['Doubtfull', 'it', 'stood', ',', 'As', 'two', 'spent', 'Swimmers', ',', 'that', 'doe', 'cling', 'together', ',', 'And', 'choake', 'their', 'Art', ':', 'The', 'mercilesse', 'Macdonwald', '(', 'Worthie', 'to', 'be', 'a', 'Rebell', ',', 'for', 'to', 'that', 'The', 'multiplying', 'Villanies', 'of', 'Nature', 'Doe', 'swarme', 'vpon', 'him', ')', 'from', 'the', 'Westerne', 'Isles', 'Of', 'Kernes', 'and', 'Gallowgrosses', 'is', 'supply', \"'\", 'd', ',', 'And', 'Fortune', 'on', 'his', 'damned', 'Quarry', 'smiling', ',', 'Shew', \"'\", 'd', 'like', 'a', 'Rebells', 'Whore', ':', 'but', 'all', \"'\", 's', 'too', 'weake', ':', 'For', 'braue', 'Macbeth', '(', 'well', 'hee', 'deserues', 'that', 'Name', ')', 'Disdayning', 'Fortune', ',', 'with', 'his', 'brandisht', 'Steele', ',', 'Which', 'smoak', \"'\", 'd', 'with', 'bloody', 'execution', '(', 'Like', 'Valours', 'Minion', ')', 'caru', \"'\", 'd', 'out', 'his', 'passage', ',', 'Till', 'hee', 'fac', \"'\", 'd', 'the', 'Slaue', ':', 'Which', 'neu', \"'\", 'r', 'shooke', 'hands', ',', 'nor', 'bad', 'farwell', 'to', 'him', ',', 'Till', 'he', 'vnseam', \"'\", 'd', 'him', 'from', 'the', 'Naue', 'toth', \"'\", 'Chops', ',', 'And', 'fix', \"'\", 'd', 'his', 'Head', 'vpon', 'our', 'Battlements']]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# print the length of the longest sentence in shakespeare-macbeth.txt and the corresponding sentence(s)\n",
+    "longest_len = max([len(s) for s in macbeth_sentance])\n",
+    "print('Longest length :: {}\\n'.format(longest_len))\n",
+    "print([s for s in macbeth_sentance if len(s)==longest_len])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}