|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "markdown", |
| 5 | + "metadata": {}, |
| 6 | + "source": [ |
| 7 | + "# Accessing Text Corpora" |
| 8 | + ] |
| 9 | + }, |
| 10 | + { |
| 11 | + "cell_type": "code", |
| 12 | + "execution_count": 1, |
| 13 | + "metadata": {}, |
| 14 | + "outputs": [ |
| 15 | + { |
| 16 | + "name": "stdout", |
| 17 | + "output_type": "stream", |
| 18 | + "text": [ |
| 19 | + "*** Introductory Examples for the NLTK Book ***\n", |
| 20 | + "Loading text1, ..., text9 and sent1, ..., sent9\n", |
| 21 | + "Type the name of the text or sentence to view it.\n", |
| 22 | + "Type: 'texts()' or 'sents()' to list the materials.\n", |
| 23 | + "text1: Moby Dick by Herman Melville 1851\n", |
| 24 | + "text2: Sense and Sensibility by Jane Austen 1811\n", |
| 25 | + "text3: The Book of Genesis\n", |
| 26 | + "text4: Inaugural Address Corpus\n", |
| 27 | + "text5: Chat Corpus\n", |
| 28 | + "text6: Monty Python and the Holy Grail\n", |
| 29 | + "text7: Wall Street Journal\n", |
| 30 | + "text8: Personals Corpus\n", |
| 31 | + "text9: The Man Who Was Thursday by G . K . Chesterton 1908\n" |
| 32 | + ] |
| 33 | + } |
| 34 | + ], |
| 35 | + "source": [ |
| 36 | + "import nltk\n", |
| 37 | + "from nltk.book import *" |
| 38 | + ] |
| 39 | + }, |
| 40 | + { |
| 41 | + "cell_type": "markdown", |
| 42 | + "metadata": {}, |
| 43 | + "source": [ |
| 44 | + "## Gutenberg Corpus" |
| 45 | + ] |
| 46 | + }, |
| 47 | + { |
| 48 | + "cell_type": "markdown", |
| 49 | + "metadata": {}, |
| 50 | + "source": [ |
| 51 | + "NLTK includes a small selection of texts from the Project Gutenberg archive, which contains some 25,000 free e-books." |
| 52 | + ] |
| 53 | + }, |
| 54 | + { |
| 55 | + "cell_type": "code", |
| 56 | + "execution_count": 5, |
| 57 | + "metadata": {}, |
| 58 | + "outputs": [ |
| 59 | + { |
| 60 | + "name": "stdout", |
| 61 | + "output_type": "stream", |
| 62 | + "text": [ |
| 63 | + "['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']\n" |
| 64 | + ] |
| 65 | + } |
| 66 | + ], |
| 67 | + "source": [ |
| 68 | + "print(nltk.corpus.gutenberg.fileids())" |
| 69 | + ] |
| 70 | + }, |
| 71 | + { |
| 72 | + "cell_type": "code", |
| 73 | + "execution_count": 8, |
| 74 | + "metadata": {}, |
| 75 | + "outputs": [ |
| 76 | + { |
| 77 | + "data": { |
| 78 | + "text/plain": [ |
| 79 | + "192427" |
| 80 | + ] |
| 81 | + }, |
| 82 | + "execution_count": 8, |
| 83 | + "metadata": {}, |
| 84 | + "output_type": "execute_result" |
| 85 | + } |
| 86 | + ], |
| 87 | + "source": [ |
| 88 | + "# let's choose one of them\n", |
| 89 | + "emma = nltk.corpus.gutenberg.words('austen-emma.txt')\n", |
| 90 | + "# number of word tokens (including punctuation) in austen-emma.txt\n", |
| 91 | + "len(emma)" |
| 92 | + ] |
| 93 | + }, |
| 94 | + { |
| 95 | + "cell_type": "code", |
| 96 | + "execution_count": 10, |
| 97 | + "metadata": {}, |
| 98 | + "outputs": [ |
| 99 | + { |
| 100 | + "name": "stdout", |
| 101 | + "output_type": "stream", |
| 102 | + "text": [ |
| 103 | + "4 24 26 austen-emma.txt \n", |
| 104 | + "4 26 16 austen-persuasion.txt \n", |
| 105 | + "4 28 22 austen-sense.txt \n", |
| 106 | + "4 33 79 bible-kjv.txt \n", |
| 107 | + "4 19 5 blake-poems.txt \n", |
| 108 | + "4 19 14 bryant-stories.txt \n", |
| 109 | + "4 17 12 burgess-busterbrown.txt \n", |
| 110 | + "4 20 12 carroll-alice.txt \n", |
| 111 | + "4 20 11 chesterton-ball.txt \n", |
| 112 | + "4 22 11 chesterton-brown.txt \n", |
| 113 | + "4 18 10 chesterton-thursday.txt \n", |
| 114 | + "4 20 24 edgeworth-parents.txt \n", |
| 115 | + "4 25 15 melville-moby_dick.txt \n", |
| 116 | + "4 52 10 milton-paradise.txt \n", |
| 117 | + "4 11 8 shakespeare-caesar.txt \n", |
| 118 | + "4 12 7 shakespeare-hamlet.txt \n", |
| 119 | + "4 12 6 shakespeare-macbeth.txt \n", |
| 120 | + "4 36 12 whitman-leaves.txt \n" |
| 121 | + ] |
| 122 | + } |
| 123 | + ], |
| 124 | + "source": [ |
| 125 | + "# for every text, print the average number of characters per word, the average sentence length (words per sentence), and the average number of times each vocabulary item occurs\n", |
| 126 | + "for fileid in gutenberg.fileids():\n", |
| 127 | + " num_char = len(gutenberg.raw(fileid))\n", |
| 128 | + " num_words = len(gutenberg.words(fileid))\n", |
| 129 | + " num_sentance = len(gutenberg.sents(fileid))\n", |
| 130 | + " num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))\n", |
| 131 | + " print('{} {} {} {} '.format(int(num_char/num_words),int(num_words/num_sentance),int(num_words/num_vocab),fileid))" |
| 132 | + ] |
| 133 | + }, |
| 134 | + { |
| 135 | + "cell_type": "code", |
| 136 | + "execution_count": 11, |
| 137 | + "metadata": {}, |
| 138 | + "outputs": [ |
| 139 | + { |
| 140 | + "name": "stdout", |
| 141 | + "output_type": "stream", |
| 142 | + "text": [ |
| 143 | + "[['[', 'The', 'Tragedie', 'of', 'Macbeth', 'by', 'William', 'Shakespeare', '1603', ']'], ['Actus', 'Primus', '.'], ['Scoena', 'Prima', '.'], ['Thunder', 'and', 'Lightning', '.'], ['Enter', 'three', 'Witches', '.'], ['1', '.'], ['When', 'shall', 'we', 'three', 'meet', 'againe', '?'], ['In', 'Thunder', ',', 'Lightning', ',', 'or', 'in', 'Raine', '?'], ['2', '.'], ['When', 'the', 'Hurley', '-', 'burley', \"'\", 's', 'done', ',', 'When', 'the', 'Battaile', \"'\", 's', 'lost', ',', 'and', 'wonne']]\n" |
| 144 | + ] |
| 145 | + } |
| 146 | + ], |
| 147 | + "source": [ |
| 148 | + "# print the first 10 sentences from shakespeare-macbeth.txt\n", |
| 149 | + "macbeth_sentance = gutenberg.sents('shakespeare-macbeth.txt')\n", |
| 150 | + "print(macbeth_sentance[:10])" |
| 151 | + ] |
| 152 | + }, |
| 153 | + { |
| 154 | + "cell_type": "code", |
| 155 | + "execution_count": 16, |
| 156 | + "metadata": {}, |
| 157 | + "outputs": [ |
| 158 | + { |
| 159 | + "name": "stdout", |
| 160 | + "output_type": "stream", |
| 161 | + "text": [ |
| 162 | + "Longest length :: 158\n", |
| 163 | + "\n", |
| 164 | + "[['Doubtfull', 'it', 'stood', ',', 'As', 'two', 'spent', 'Swimmers', ',', 'that', 'doe', 'cling', 'together', ',', 'And', 'choake', 'their', 'Art', ':', 'The', 'mercilesse', 'Macdonwald', '(', 'Worthie', 'to', 'be', 'a', 'Rebell', ',', 'for', 'to', 'that', 'The', 'multiplying', 'Villanies', 'of', 'Nature', 'Doe', 'swarme', 'vpon', 'him', ')', 'from', 'the', 'Westerne', 'Isles', 'Of', 'Kernes', 'and', 'Gallowgrosses', 'is', 'supply', \"'\", 'd', ',', 'And', 'Fortune', 'on', 'his', 'damned', 'Quarry', 'smiling', ',', 'Shew', \"'\", 'd', 'like', 'a', 'Rebells', 'Whore', ':', 'but', 'all', \"'\", 's', 'too', 'weake', ':', 'For', 'braue', 'Macbeth', '(', 'well', 'hee', 'deserues', 'that', 'Name', ')', 'Disdayning', 'Fortune', ',', 'with', 'his', 'brandisht', 'Steele', ',', 'Which', 'smoak', \"'\", 'd', 'with', 'bloody', 'execution', '(', 'Like', 'Valours', 'Minion', ')', 'caru', \"'\", 'd', 'out', 'his', 'passage', ',', 'Till', 'hee', 'fac', \"'\", 'd', 'the', 'Slaue', ':', 'Which', 'neu', \"'\", 'r', 'shooke', 'hands', ',', 'nor', 'bad', 'farwell', 'to', 'him', ',', 'Till', 'he', 'vnseam', \"'\", 'd', 'him', 'from', 'the', 'Naue', 'toth', \"'\", 'Chops', ',', 'And', 'fix', \"'\", 'd', 'his', 'Head', 'vpon', 'our', 'Battlements']]\n" |
| 165 | + ] |
| 166 | + } |
| 167 | + ], |
| 168 | + "source": [ |
| 169 | + "# print the length of the longest sentence in shakespeare-macbeth.txt and the corresponding sentence(s)\n", |
| 170 | + "longest_len = max([len(s) for s in macbeth_sentance])\n", |
| 171 | + "print('Longest length :: {}\\n'.format(longest_len))\n", |
| 172 | + "print([s for s in macbeth_sentance if len(s)==longest_len])" |
| 173 | + ] |
| 174 | + }, |
| 175 | + { |
| 176 | + "cell_type": "code", |
| 177 | + "execution_count": null, |
| 178 | + "metadata": {}, |
| 179 | + "outputs": [], |
| 180 | + "source": [] |
| 181 | + } |
| 182 | + ], |
| 183 | + "metadata": { |
| 184 | + "kernelspec": { |
| 185 | + "display_name": "Python 3", |
| 186 | + "language": "python", |
| 187 | + "name": "python3" |
| 188 | + }, |
| 189 | + "language_info": { |
| 190 | + "codemirror_mode": { |
| 191 | + "name": "ipython", |
| 192 | + "version": 3 |
| 193 | + }, |
| 194 | + "file_extension": ".py", |
| 195 | + "mimetype": "text/x-python", |
| 196 | + "name": "python", |
| 197 | + "nbconvert_exporter": "python", |
| 198 | + "pygments_lexer": "ipython3", |
| 199 | + "version": "3.7.3" |
| 200 | + } |
| 201 | + }, |
| 202 | + "nbformat": 4, |
| 203 | + "nbformat_minor": 2 |
| 204 | +} |
0 commit comments