Skip to content

Commit beb0f50

Browse files
committed
Allowed search results for Django code terms which contain stop words.
1 parent 0070473 commit beb0f50

File tree

4 files changed

+227
-2
lines changed

4 files changed

+227
-2
lines changed

docs/models.py

+65-2
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import html
33
import json
44
import operator
5+
import re
56
from functools import reduce
67
from pathlib import Path
78

@@ -15,7 +16,7 @@
1516
TrigramSimilarity,
1617
)
1718
from django.core.cache import cache
18-
from django.db import models, transaction
19+
from django.db import models, transaction, connection
1920
from django.db.models import Prefetch, Q
2021
from django.db.models.fields.json import KeyTextTransform
2122
from django.utils.functional import cached_property
@@ -174,6 +175,18 @@ def sync_to_db(self, decoded_documents):
174175
if line.startswith(f"Disallow: /{self.lang}/{self.release_id}/")
175176
]
176177

178+
language_mapping = TSEARCH_CONFIG_LANGUAGES
179+
english = "custom_english"
180+
with connection.cursor() as cursor:
181+
cursor.execute(
182+
"SELECT EXISTS(SELECT 1 FROM pg_ts_config WHERE cfgname = %s)",
183+
[english]
184+
)
185+
has_custom_english_config = cursor.fetchone()[0]
186+
187+
if has_custom_english_config:
188+
language_mapping["en"] = english
189+
177190
for document in decoded_documents:
178191
if (
179192
"body" not in document
@@ -187,12 +200,16 @@ def sync_to_db(self, decoded_documents):
187200
document_path = _clean_document_path(document["current_page_name"])
188201
document["slug"] = Path(document_path).parts[-1]
189202
document["parents"] = " ".join(Path(document_path).parts[:-1])
203+
document["code_references"] = _generate_code_references(document["body"])
204+
document["code_references_search"] = " ".join(
205+
document["code_references"].keys()
206+
)
190207
Document.objects.create(
191208
release=self,
192209
path=document_path,
193210
title=html.unescape(strip_tags(document["title"])),
194211
metadata=document,
195-
config=TSEARCH_CONFIG_LANGUAGES.get(
212+
config=language_mapping.get(
196213
self.lang[:2], DEFAULT_TEXT_SEARCH_CONFIG
197214
),
198215
)
@@ -213,6 +230,52 @@ def _clean_document_path(path):
213230
return path
214231

215232

233+
def _generate_code_references(body):
234+
"""
235+
Django documents classes with the syntax `.. class::`.
236+
This results in the following HTML:
237+
<dl class="py class">
238+
<dt class="sig sig-object py" id="django.db.models.ManyToManyField">
239+
...
240+
</dt>
241+
</dl>
242+
This is similar for attributes (`.. attribute::`), methods etc.
243+
"""
244+
# Collect all <dt> HTML tag ids into a list, e.g:
245+
# [
246+
# 'django.db.models.Index',
247+
# 'django.db.models.Index.expressions',
248+
# 'django.db.models.Index.fields',
249+
# ...
250+
# ]
251+
code_references = list(re.findall(r'<dt[^>]+id="([^"]+)"', body))
252+
# As the search term can be "expressions", "Index.expressions" etc. create a mapping
253+
# between potential code search terms and their HTML id.
254+
# {
255+
# 'django.db.models.Index': 'django.db.models.Index',
256+
# 'Index': 'django.db.models.Index',
257+
# 'models.Index': 'django.db.models.Index',
258+
# 'db.models.Index': 'django.db.models.Index',
259+
# 'django.db.models.Index.expressions': 'django.db.models.Index.expressions',
260+
# 'expressions': 'django.db.models.Index.expressions',
261+
# 'Index.expressions': 'django.db.models.Index.expressions',
262+
# 'models.Index.expressions': 'django.db.models.Index.expressions',
263+
# 'db.models.Index.expressions': 'django.db.models.Index.expressions',
264+
# 'django.db.models.Index.fields': 'django.db.models.Index.fields',
265+
# 'fields': 'django.db.models.Index.fields',
266+
# 'Index.fields': 'django.db.models.Index.fields',
267+
# 'models.Index.fields': 'django.db.models.Index.fields',
268+
# 'db.models.Index.fields': 'django.db.models.Index.fields',
269+
# ...
270+
# }
271+
code_paths = {}
272+
for reference in code_references:
273+
code_path = reference.split(".")
274+
for i in range(len(code_path)):
275+
code_paths[".".join(code_path[-i:])] = reference
276+
return code_paths
277+
278+
216279
def document_url(doc):
217280
if doc.path:
218281
kwargs = {

docs/search.py

+5
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,11 @@
4343
SearchVector("title", weight="A", config=F("config"))
4444
+ SearchVector(KeyTextTransform("slug", "metadata"), weight="A", config=F("config"))
4545
+ SearchVector(KeyTextTransform("toc", "metadata"), weight="B", config=F("config"))
46+
+ SearchVector(
47+
KeyTextTransform("code_references_search", "metadata"),
48+
weight="B",
49+
config=F("config"),
50+
)
4651
+ SearchVector(KeyTextTransform("body", "metadata"), weight="C", config=F("config"))
4752
+ SearchVector(
4853
KeyTextTransform("parents", "metadata"), weight="D", config=F("config")

docs/stopwords/README.md

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Instructions to create a new search dictionary
2+
3+
In this folder, there is `custom_english.stop`.
4+
5+
This copies the [snowball english stop words](https://github.com/postgres/postgres/blob/master/src/backend/snowball/stopwords/english.stop)
6+
but removes some stop words such as "through" and "when". This is because these
7+
terms are also used in Django code.
8+
9+
The file format is a list of words, one per line. Blank lines and trailing
10+
spaces are ignored, and upper case is folded to lower case, but no other
11+
processing is done on the file contents.
12+
13+
This file needs to be created in `$SHAREDIR/tsearch_data/custom_english.stop`,
14+
where `$SHAREDIR` means the PostgreSQL installation's shared-data directory,
15+
available via `pg_config --sharedir`.
16+
17+
See https://www.postgresql.org/docs/current/textsearch-dictionaries.html
18+
19+
Once the custom stop words file has been created, we can run the following SQL:
20+
21+
```sql
22+
CREATE TEXT SEARCH DICTIONARY english_custom (
23+
TEMPLATE = snowball,
24+
Language = english,
25+
StopWords = custom_english
26+
);
27+
28+
CREATE TEXT SEARCH CONFIGURATION public.custom_english (
29+
COPY = pg_catalog.english
30+
);
31+
32+
ALTER TEXT SEARCH CONFIGURATION public.custom_english
33+
ALTER MAPPING
34+
FOR asciiword, asciihword, hword_asciipart, hword, hword_part, word
35+
WITH english_custom;
36+
```
37+
38+
This should then mean the `english_custom` search dictionary is available.

docs/stopwords/custom_english.stop

+119
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
i
2+
me
3+
my
4+
myself
5+
we
6+
our
7+
ours
8+
ourselves
9+
you
10+
your
11+
yours
12+
yourself
13+
yourselves
14+
he
15+
him
16+
his
17+
himself
18+
she
19+
her
20+
hers
21+
herself
22+
it
23+
its
24+
itself
25+
they
26+
them
27+
their
28+
theirs
29+
themselves
30+
what
31+
which
32+
who
33+
whom
34+
this
35+
that
36+
these
37+
those
38+
am
39+
is
40+
are
41+
was
42+
were
43+
be
44+
been
45+
being
46+
have
47+
has
48+
had
49+
having
50+
do
51+
does
52+
did
53+
doing
54+
a
55+
an
56+
the
57+
and
58+
but
59+
or
60+
because
61+
as
62+
until
63+
while
64+
of
65+
at
66+
by
67+
about
68+
against
69+
between
70+
into
71+
during
72+
before
73+
after
74+
above
75+
below
76+
to
77+
from
78+
up
79+
down
80+
in
81+
out
82+
on
83+
off
84+
over
85+
under
86+
again
87+
further
88+
then
89+
once
90+
here
91+
there
92+
where
93+
why
94+
how
95+
any
96+
both
97+
each
98+
few
99+
more
100+
most
101+
other
102+
some
103+
such
104+
no
105+
nor
106+
not
107+
own
108+
same
109+
so
110+
than
111+
too
112+
very
113+
s
114+
t
115+
can
116+
will
117+
just
118+
don
119+
should

0 commit comments

Comments
 (0)