-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathuk.py
More file actions
120 lines (97 loc) · 3.42 KB
/
uk.py
File metadata and controls
120 lines (97 loc) · 3.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
from glob import glob
import logging
from datetime import datetime
from django.conf import settings
from ianalyzer_readers.extract import Constant, CSV
from addcorpus.python_corpora.corpus import CSVCorpusDefinition
from corpora.parliament.parliament import Parliament
import corpora.parliament.utils.field_defaults as field_defaults
from corpora.utils.constants import document_context
def format_debate_title(title):
if title.endswith('.'):
title = title[:-1]
return title.title()
def format_house(house):
if 'commons_wmhall' in house.lower():
return 'House of Commons - Westminster Hall'
elif 'commons' in house.lower():
return 'House of Commons'
elif 'lords' in house.lower():
return 'House of Lords'
def format_speaker(speaker):
if speaker:
if speaker.startswith('*'):
speaker = speaker[1:]
return speaker.title()
class ParliamentUK(Parliament, CSVCorpusDefinition):
title = 'People & Parliament (UK)'
description = "Speeches from the House of Lords and House of Commons"
data_directory = settings.PP_UK_DATA
min_date = datetime(year=1803, month=1, day=1)
max_date = datetime(year=2021, month=12, day=31)
es_index = getattr(settings, 'PP_UK_INDEX', 'parliament-uk')
image = 'uk.jpeg'
word_model_path = getattr(settings, 'PP_UK_WM', None)
languages = ['en']
description_page = 'uk.md'
field_entry = 'speech_id'
document_context = document_context()
def sources(self, start, end):
logger = logging.getLogger('indexing')
for csv_file in glob('{}/*.csv'.format(self.data_directory)):
yield csv_file, {}
chamber = field_defaults.chamber()
chamber.extractor = CSV(
'house',
transform=format_house
)
chamber.search_filter.option_count = 3
country = field_defaults.country()
country.extractor = Constant(
value='United Kingdom'
)
date = field_defaults.date()
date.extractor = CSV('date')
debate_title = field_defaults.debate_title()
debate_title.extractor = CSV(
'debate',
transform=format_debate_title
)
debate_title.language = 'en'
debate_id = field_defaults.debate_id()
debate_id.extractor = CSV('debate_id')
speech = field_defaults.speech(language='en')
speech.extractor = CSV(
'content',
multiple=True,
transform=lambda x : ' '.join(x)
)
speech_id = field_defaults.speech_id()
speech_id.extractor = CSV('speech_id')
speech_type = field_defaults.speech_type()
speech_type.extractor = CSV('speech_type')
speaker = field_defaults.speaker()
speaker.extractor = CSV(
'speaker_name',
transform=format_speaker
)
speaker_id = field_defaults.speaker_id()
speaker_id.extractor = CSV('speaker_id')
topic = field_defaults.topic()
topic.extractor = CSV('heading_major',)
topic.language = 'en'
subtopic = field_defaults.subtopic()
subtopic.extractor = CSV('heading_minor')
subtopic.language = 'en'
sequence = field_defaults.sequence()
sequence.extractor = CSV('sequence')
def __init__(self):
self.fields = [
self.country, self.date,
self.debate_title, self.debate_id,
self.topic, self.subtopic,
self.chamber,
self.speech, self.speech_id, self.speech_type,
self.sequence,
self.speaker, self.speaker_id,
]