From 735bf7f4186f160799fc383652c5690dacf720e9 Mon Sep 17 00:00:00 2001 From: Robert Birke Date: Mon, 28 Oct 2024 09:42:22 +0100 Subject: [PATCH 1/7] Added parse_url() entrypoint --- bibtexparser/__init__.py | 1 + bibtexparser/entrypoint.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/bibtexparser/__init__.py b/bibtexparser/__init__.py index 8239d3d..31609ca 100644 --- a/bibtexparser/__init__.py +++ b/bibtexparser/__init__.py @@ -3,6 +3,7 @@ import bibtexparser.model from bibtexparser.entrypoint import parse_file from bibtexparser.entrypoint import parse_string +from bibtexparser.entrypoint import parse_url from bibtexparser.entrypoint import write_file from bibtexparser.entrypoint import write_string from bibtexparser.library import Library diff --git a/bibtexparser/entrypoint.py b/bibtexparser/entrypoint.py index f1bad67..8c42949 100644 --- a/bibtexparser/entrypoint.py +++ b/bibtexparser/entrypoint.py @@ -132,6 +132,34 @@ def parse_file( bibtex_str, parse_stack=parse_stack, append_middleware=append_middleware ) +def parse_url( + url: str, + parse_stack: Optional[Iterable[Middleware]] = None, + append_middleware: Optional[Iterable[Middleware]] = None, + encoding: str = "UTF-8", +) -> Library: + """Parse a BibTeX file from a URL + + :param url: URL of the BibTeX file + :param parse_stack: + List of middleware to apply to the database after splitting. + If ``None`` (default), a default stack will be used providing simple standard functionality. + + :param append_middleware: + List of middleware to append to the default stack + (ignored if a not-``None`` parse_stack is passed). + + :param encoding: Encoding of the .bib file. Default encoding is ``"UTF-8"``. 
+ :return: Library: Parsed BibTeX library + """ + import urllib + + with urllib.request.urlopen(url) as f: + bibtex_str = f.read().decode(encoding) + return parse_string( + bibtex_str, parse_stack=parse_stack, append_middleware=append_middleware + ) + def write_file( file: Union[str, TextIO], From 873b4096a88e88a217d9657e32b6adb4c79b6c4d Mon Sep 17 00:00:00 2001 From: Robert Birke Date: Mon, 28 Oct 2024 10:36:05 +0100 Subject: [PATCH 2/7] Fixed import --- bibtexparser/entrypoint.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bibtexparser/entrypoint.py b/bibtexparser/entrypoint.py index 8c42949..cbd3db5 100644 --- a/bibtexparser/entrypoint.py +++ b/bibtexparser/entrypoint.py @@ -152,7 +152,7 @@ def parse_url( :param encoding: Encoding of the .bib file. Default encoding is ``"UTF-8"``. :return: Library: Parsed BibTeX library """ - import urllib + import urllib.request with urllib.request.urlopen(url) as f: bibtex_str = f.read().decode(encoding) @@ -160,7 +160,6 @@ def parse_url( bibtex_str, parse_stack=parse_stack, append_middleware=append_middleware ) - def write_file( file: Union[str, TextIO], library: Library, From fcba5adc14f4312d17da80560ea6bfa9a7377079 Mon Sep 17 00:00:00 2001 From: Robert Birke Date: Mon, 28 Oct 2024 10:37:33 +0100 Subject: [PATCH 3/7] Added SeparateCSVLists middleware to split fields with CSV lists --- bibtexparser/middlewares/__init__.py | 1 + bibtexparser/middlewares/lists.py | 61 ++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 bibtexparser/middlewares/lists.py diff --git a/bibtexparser/middlewares/__init__.py b/bibtexparser/middlewares/__init__.py index 3fd1a36..7f23253 100644 --- a/bibtexparser/middlewares/__init__.py +++ b/bibtexparser/middlewares/__init__.py @@ -4,6 +4,7 @@ from bibtexparser.middlewares.interpolate import ResolveStringReferencesMiddleware from bibtexparser.middlewares.latex_encoding import LatexDecodingMiddleware from bibtexparser.middlewares.latex_encoding 
import LatexEncodingMiddleware +from bibtexparser.middlewares.lists import SeparateCSVLists from bibtexparser.middlewares.middleware import BlockMiddleware from bibtexparser.middlewares.middleware import LibraryMiddleware from bibtexparser.middlewares.month import MonthAbbreviationMiddleware diff --git a/bibtexparser/middlewares/lists.py b/bibtexparser/middlewares/lists.py new file mode 100644 index 0000000..e8b4ae3 --- /dev/null +++ b/bibtexparser/middlewares/lists.py @@ -0,0 +1,61 @@ +import abc +from typing import List, Literal, Tuple + +from bibtexparser.model import Block, Entry, Field + +from .middleware import BlockMiddleware + +class _ListTransformerMiddleware(BlockMiddleware, abc.ABC): + """Internal utility class - superclass for all list-transforming middlewares. + + :param allow_inplace_modification: See corresponding property. + :param field_names: The keys of the fields considered by this middleware.""" + + def __init__( + self, + allow_inplace_modification: bool = True, + field_names: Tuple[str] = (), + ): + super().__init__( + allow_inplace_modification=allow_inplace_modification, + allow_parallel_execution=True, + ) + self._field_names = field_names + + @property + def field_names(self) -> Tuple[str]: + """The keys of the fields considered by this middleware.""" + return self._field_names + + @abc.abstractmethod + def _transform_field_value(self, name): + raise NotImplementedError("called abstract method") + + # docstr-coverage: inherited + def transform_entry(self, entry: Entry, *args, **kwargs) -> Block: + field: Field + + for field in entry.fields: + if field.key in self.field_names: + field.value = self._transform_field_value(field.value) + return entry + + +def split_comma_separated_list(string): + """Helper function to split a list of comma separated values.""" + import re + pattern = re.compile(r'\s*,\s*') # Remove extra spaces before and after comma + return re.sub(pattern, ',', string).split(",") + + +class 
SeparateCSVLists(_ListTransformerMiddleware): + """Middleware to separate comma-separated values in fields.""" + + # docstr-coverage: inherited + @classmethod + def metadata_key(cls) -> str: + return "separate_lists" + + # docstr-coverage: inherited + def _transform_field_value(self, string) -> List[str]: + return split_comma_separated_list(string) \ No newline at end of file From ae74c4b31dff4e4c8f77b85fb8c97c5bcb9b533a Mon Sep 17 00:00:00 2001 From: Robert Birke Date: Mon, 28 Oct 2024 10:52:07 +0100 Subject: [PATCH 4/7] Added SortBlocksByYearMonthDayMiddleware to sort blocks by time --- bibtexparser/middlewares/__init__.py | 1 + bibtexparser/middlewares/sorting_blocks.py | 93 ++++++++++++++++++++++ 2 files changed, 94 insertions(+) diff --git a/bibtexparser/middlewares/__init__.py b/bibtexparser/middlewares/__init__.py index 7f23253..c7ac7c6 100644 --- a/bibtexparser/middlewares/__init__.py +++ b/bibtexparser/middlewares/__init__.py @@ -16,6 +16,7 @@ from bibtexparser.middlewares.names import SeparateCoAuthors from bibtexparser.middlewares.names import SplitNameParts from bibtexparser.middlewares.sorting_blocks import SortBlocksByTypeAndKeyMiddleware +from bibtexparser.middlewares.sorting_blocks import SortBlocksByYearMonthDayMiddleware from bibtexparser.middlewares.sorting_entry_fields import SortFieldsAlphabeticallyMiddleware from bibtexparser.middlewares.sorting_entry_fields import SortFieldsCustomMiddleware diff --git a/bibtexparser/middlewares/sorting_blocks.py b/bibtexparser/middlewares/sorting_blocks.py index 5ff5f13..bc5bf60 100644 --- a/bibtexparser/middlewares/sorting_blocks.py +++ b/bibtexparser/middlewares/sorting_blocks.py @@ -120,3 +120,96 @@ def _sort_key(block: Block): blocks.sort(key=_sort_key) return Library(blocks=blocks) + + +class SortBlocksByYearMonthDayMiddleware(LibraryMiddleware): + """Sorts the blocks of a library by year, month and day. 
+ + :param descending: uses descending ordering (ascending by default) + :param preserve_comments_on_top: comments remain above same block (default True) + """ + + def __init__( + self, + preserve_comments_on_top: bool = True, + descending = False + ): + self._preserve_comments_on_top = preserve_comments_on_top + self._descending = descending + + # In-place modification is not yet supported, we make this explicit here, + super().__init__(allow_inplace_modification=False) + + @staticmethod + # Sort blocks by year, month and day (default 0 in case entry has no year, month or day) + # Month should be an integer (recommended to use MonthIntMiddleware beforehand) + def _sort_key(block: Block): + month = 0 + year = 0 + day = 0 + try: + try: + v = block.fields_dict["day"].value + if isinstance(v, str) and v.isdigit(): + v = int(v) + if isinstance(v, int): + if v >= 1 or v <= 31: + day = v + except KeyError: + # No day field + pass + try: + v = block.fields_dict["month"].value + if isinstance(v, str) and v.isdigit(): + v = int(v) + if isinstance(v, int): + if v >= 1 or v <= 12: + month = v + except KeyError: + # No month field + pass + try: + year = int(block.fields_dict["year"].value) + except KeyError: + # No year field + pass + except AttributeError: + # No fiedlds_dict (e.g. 
Comments) + pass + return year, month, day + + # docstr-coverage: inherited + def transform(self, library: Library) -> Library: + blocks = deepcopy(library.blocks) + + if self._preserve_comments_on_top: + # We start creating a new list of block_junks (made of comments and entries) + block_junks = [] + current_junk = _BlockJunk() + for block in blocks: + current_junk.blocks.append(block) + current_junk.sort_key = self._sort_key(block) + + if not ( + isinstance(block, ExplicitComment) or isinstance(block, ImplicitComment) + ): + # We added a non-comment block, hence we finish the junk and + # start a new one + block_junks.append(current_junk) + current_junk = _BlockJunk() + + if current_junk.blocks: + # That would be a junk with only comments, but we add it at the end for completeness + block_junks.append(current_junk) + + def _sort_key(block_junk): + return block_junk.sort_key + + block_junks.sort(key=_sort_key, reverse=self._descending) + return Library( + blocks=[block for block_junk in block_junks for block in block_junk.blocks] + ) + + else: + blocks.sort(key=self._sort_key) + return Library(blocks=blocks) From 1c6b97081d527b43a094656c54a95045dd71bd06 Mon Sep 17 00:00:00 2001 From: Robert Birke Date: Mon, 28 Oct 2024 17:26:06 +0100 Subject: [PATCH 5/7] Corrected typo --- bibtexparser/middlewares/sorting_blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bibtexparser/middlewares/sorting_blocks.py b/bibtexparser/middlewares/sorting_blocks.py index bc5bf60..1da317c 100644 --- a/bibtexparser/middlewares/sorting_blocks.py +++ b/bibtexparser/middlewares/sorting_blocks.py @@ -174,7 +174,7 @@ def _sort_key(block: Block): # No year field pass except AttributeError: - # No fiedlds_dict (e.g. Comments) + # No fields_dict (e.g. 
Comments) pass return year, month, day From 485eef4003f5d79468b2149d9f5e14f70c05df92 Mon Sep 17 00:00:00 2001 From: Robert Birke Date: Mon, 28 Oct 2024 17:26:33 +0100 Subject: [PATCH 6/7] Added filter() function --- bibtexparser/library.py | 43 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/bibtexparser/library.py b/bibtexparser/library.py index fecd1bc..d000b72 100644 --- a/bibtexparser/library.py +++ b/bibtexparser/library.py @@ -6,11 +6,13 @@ from .model import DuplicateBlockKeyBlock from .model import Entry from .model import ExplicitComment +from .model import Field from .model import ImplicitComment from .model import ParsingFailedBlock from .model import Preamble from .model import String + # TODO Use functools.lru_cache for library properties (which create lists when called) @@ -195,3 +197,44 @@ def comments(self) -> List[Union[ExplicitComment, ImplicitComment]]: return [ block for block in self._blocks if isinstance(block, (ExplicitComment, ImplicitComment)) ] + + def filter(self, + filter: Dict, + case_sensitive = False + ) -> List[Entry]: + """ Return filtered list of entries. 
Filter is a dict mapping a field key to an accepted value (or a list/set of accepted values); an entry is returned only if every key matches. Note that the passed dict is normalized in place.""" + entries = [] + + + # Transform each filter value into a set + for k in filter.keys(): + if not isinstance(filter[k], set): + if isinstance(filter[k], List): + filter[k] = set([x.lower() if not case_sensitive and isinstance(x, str) else x for x in filter[k]]) + else: + x = filter[k].lower() if not case_sensitive and isinstance(filter[k], str) else filter[k] + filter[k] = set([x]) + + for block in self._blocks: + if isinstance(block, Entry): + found = True + for key in filter.keys(): + if key in block.fields_dict.keys(): + if isinstance(block.fields_dict[key], Field): + if isinstance(block.fields_dict[key].value, List): + bset = set([x.lower() if not case_sensitive and isinstance(x, str) else x for x in block.fields_dict[key].value]) + else: + x = block.fields_dict[key].value.lower() if not case_sensitive and isinstance(block.fields_dict[key].value, str) else block.fields_dict[key].value + bset = set([x]) + + if not set(bset).intersection(filter[key]): + found = False + break + else: + found = False + else: + found = False + if found: + entries.append(block) + + return entries From b84d25940b67d3fb194c3f4f4ed60f0eb4d0b9ef Mon Sep 17 00:00:00 2001 From: Robert Birke Date: Tue, 29 Oct 2024 10:52:33 +0100 Subject: [PATCH 7/7] Added duplicate_blocks as library property --- bibtexparser/library.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bibtexparser/library.py b/bibtexparser/library.py index d000b72..03e5279 100644 --- a/bibtexparser/library.py +++ b/bibtexparser/library.py @@ -164,6 +164,11 @@ def failed_blocks(self) -> List[ParsingFailedBlock]: """All blocks that could not be parsed, preserving order of insertion.""" return [b for b in self._blocks if isinstance(b, ParsingFailedBlock)] + @property + def duplicate_blocks(self) -> List[DuplicateBlockKeyBlock]: + """All ``DuplicateBlockKeyBlock`` blocks in the library, preserving order of insertion.""" + return [b for b in self._blocks if isinstance(b, DuplicateBlockKeyBlock)] + @property def strings(self) -> 
List[String]: """All @string blocks in the library, preserving order of insertion."""