# ---------------------------------------------------------------------------
# bibtexparser/__init__.py
# ---------------------------------------------------------------------------
# Public re-export added next to ``parse_file`` / ``parse_string``:
#     from bibtexparser.entrypoint import parse_url


# ---------------------------------------------------------------------------
# bibtexparser/entrypoint.py -- inserted between ``parse_file`` and ``write_file``
# ---------------------------------------------------------------------------
def parse_url(
    url: str,
    parse_stack: Optional[Iterable[Middleware]] = None,
    append_middleware: Optional[Iterable[Middleware]] = None,
    encoding: str = "UTF-8",
) -> Library:
    """Parse a BibTeX file from a URL.

    :param url: URL of the BibTeX file.
    :param parse_stack:
        List of middleware to apply to the database after splitting.
        If ``None`` (default), a default stack will be used providing simple standard functionality.
    :param append_middleware:
        List of middleware to append to the default stack
        (ignored if a not-``None`` parse_stack is passed).
    :param encoding: Encoding of the .bib file. Default encoding is ``"UTF-8"``.
    :return: Library: Parsed BibTeX library
    """
    import urllib.request

    # NOTE(security): ``urlopen`` is handed the URL as-is, with no check of its
    # scheme. Callers that pass untrusted input should validate the URL
    # (e.g. allow only http/https) before calling this function.
    with urllib.request.urlopen(url) as response:
        bibtex_str = response.read().decode(encoding)

    return parse_string(
        bibtex_str, parse_stack=parse_stack, append_middleware=append_middleware
    )


# ---------------------------------------------------------------------------
# bibtexparser/library.py
# ---------------------------------------------------------------------------
# New import, added after ``from .model import ExplicitComment``:
#     from .model import Field
class Library:
    # NOTE: excerpt only. These are the two members this change adds to the
    # existing ``Library`` class (``duplicate_blocks`` after ``failed_blocks``,
    # ``filter`` after ``comments``); all other members are unchanged.

    @property
    def duplicate_blocks(self) -> List[DuplicateBlockKeyBlock]:
        """All ``DuplicateBlockKeyBlock`` blocks in the library, preserving order of insertion."""
        return [b for b in self._blocks if isinstance(b, DuplicateBlockKeyBlock)]

    def filter(self, filter: Dict, case_sensitive: bool = False) -> List[Entry]:
        """Return the entries matching every criterion in ``filter``.

        :param filter:
            Maps a field key to the accepted value(s) for that field. A value
            may be a single object or a list/tuple/set of alternatives.
            The dict passed by the caller is never modified.
        :param case_sensitive:
            If ``False`` (default), string values are compared case-insensitively.
            Field keys are always matched exactly.
        :return:
            Entries, in order of insertion, for which *every* key in ``filter``
            is present as a field whose value (or, for list-valued fields, at
            least one of its items) is among the accepted values.
            An empty ``filter`` returns all entries.
        """

        def _normalise(value) -> set:
            # Wrap scalars so single values and collections share one code path.
            values = value if isinstance(value, (list, tuple, set, frozenset)) else [value]
            return {
                v.lower() if not case_sensitive and isinstance(v, str) else v
                for v in values
            }

        # Work on a normalised copy: the caller's dict must stay untouched.
        wanted = {key: _normalise(value) for key, value in filter.items()}

        entries = []
        for block in self._blocks:
            if not isinstance(block, Entry):
                continue
            fields = block.fields_dict  # look the mapping up once per entry
            for key, accepted in wanted.items():
                field = fields.get(key)
                if not isinstance(field, Field) or _normalise(field.value).isdisjoint(accepted):
                    break  # one failed criterion is enough to reject the entry
            else:
                entries.append(block)

        return entries


# ---------------------------------------------------------------------------
# bibtexparser/middlewares/__init__.py
# ---------------------------------------------------------------------------
# New export, added after the ``latex_encoding`` imports:
#     from bibtexparser.middlewares.lists import SeparateCSVLists
# ---------------------------------------------------------------------------
# bibtexparser/middlewares/__init__.py
# ---------------------------------------------------------------------------
# New export, added after ``SortBlocksByTypeAndKeyMiddleware``:
#     from bibtexparser.middlewares.sorting_blocks import SortBlocksByYearMonthDayMiddleware


# ---------------------------------------------------------------------------
# bibtexparser/middlewares/lists.py (new file)
# ---------------------------------------------------------------------------
import abc
from typing import List
from typing import Literal
from typing import Tuple

from bibtexparser.model import Block
from bibtexparser.model import Entry
from bibtexparser.model import Field

from .middleware import BlockMiddleware


class _ListTransformerMiddleware(BlockMiddleware, abc.ABC):
    """Internal utility class - superclass for all list-transforming middlewares.

    :param allow_inplace_modification: See corresponding property.
    :param field_names: Keys of the fields whose values this middleware transforms.
    """

    def __init__(
        self,
        allow_inplace_modification: bool = True,
        field_names: Tuple[str, ...] = (),
    ):
        super().__init__(
            allow_inplace_modification=allow_inplace_modification,
            allow_parallel_execution=True,
        )
        self._field_names = field_names

    @property
    def field_names(self) -> Tuple[str, ...]:
        """Keys of the fields whose values are transformed by this middleware."""
        return self._field_names

    @abc.abstractmethod
    def _transform_field_value(self, value):
        """Return the transformed replacement for a single field value."""
        raise NotImplementedError("called abstract method")

    # docstr-coverage: inherited
    def transform_entry(self, entry: Entry, *args, **kwargs) -> Block:
        field: Field

        for field in entry.fields:
            if field.key in self.field_names:
                field.value = self._transform_field_value(field.value)
        return entry


def split_comma_separated_list(string):
    """Split a string of comma-separated values into a list of items.

    Whitespace around each item is removed and empty items (as produced by
    an empty string, a trailing comma or ``",,"``) are dropped.
    """
    stripped_items = (item.strip() for item in string.split(","))
    return [item for item in stripped_items if item]


class SeparateCSVLists(_ListTransformerMiddleware):
    """Middleware to separate comma-separated values in fields."""

    # docstr-coverage: inherited
    @classmethod
    def metadata_key(cls) -> str:
        return "separate_lists"

    # docstr-coverage: inherited
    def _transform_field_value(self, string) -> List[str]:
        if not isinstance(string, str):
            # Not a string (e.g. the middleware already ran on this field):
            # leave the value as it is instead of failing.
            return string
        return split_comma_separated_list(string)


# ---------------------------------------------------------------------------
# bibtexparser/middlewares/sorting_blocks.py -- appended at the end of the file
# (relies on names from that module: ``deepcopy``, ``Library``, ``Block``,
# ``LibraryMiddleware``, ``ExplicitComment``, ``ImplicitComment``, ``_BlockJunk``)
# ---------------------------------------------------------------------------
class SortBlocksByYearMonthDayMiddleware(LibraryMiddleware):
    """Sorts the blocks of a library by year, month and day.

    Blocks without a usable year, month or day get ``0`` for that component.
    Month should be an integer (recommended to use MonthIntMiddleware beforehand).

    :param preserve_comments_on_top: comments remain above same block (default True)
    :param descending: uses descending ordering (ascending by default)
    """

    def __init__(self, preserve_comments_on_top: bool = True, descending: bool = False):
        self._preserve_comments_on_top = preserve_comments_on_top
        self._descending = descending

        # In-place modification is not yet supported, we make this explicit here.
        super().__init__(allow_inplace_modification=False)

    @staticmethod
    def _int_field_in_range(fields, key, lowest, highest):
        """Return ``fields[key].value`` as an int within ``[lowest, highest]``, else ``0``."""
        try:
            value = fields[key].value
        except KeyError:
            # Field is missing
            return 0
        if isinstance(value, str):
            value = value.strip()
            if not value.isdecimal():
                return 0
            value = int(value)
        if isinstance(value, int) and lowest <= value <= highest:
            return value
        return 0

    @staticmethod
    def _sort_key(block: Block):
        """Return the ``(year, month, day)`` sort key of a block."""
        try:
            fields = block.fields_dict
        except AttributeError:
            # No fields_dict (e.g. Comments)
            return 0, 0, 0

        in_range = SortBlocksByYearMonthDayMiddleware._int_field_in_range
        day = in_range(fields, "day", 1, 31)
        month = in_range(fields, "month", 1, 12)
        try:
            year = int(fields["year"].value)
        except (KeyError, ValueError, TypeError):
            # No year field, or a value that is not a number
            year = 0
        return year, month, day

    # docstr-coverage: inherited
    def transform(self, library: Library) -> Library:
        blocks = deepcopy(library.blocks)

        if not self._preserve_comments_on_top:
            blocks.sort(key=self._sort_key, reverse=self._descending)
            return Library(blocks=blocks)

        # Group every non-comment block with the comments directly above it,
        # so that each group ("junk") moves as one unit when sorting.
        block_junks = []
        current_junk = _BlockJunk()
        for block in blocks:
            current_junk.blocks.append(block)
            current_junk.sort_key = self._sort_key(block)

            if not isinstance(block, (ExplicitComment, ImplicitComment)):
                # We added a non-comment block, hence we finish the junk and
                # start a new one
                block_junks.append(current_junk)
                current_junk = _BlockJunk()

        if current_junk.blocks:
            # That would be a junk with only comments, but we add it at the end for completeness
            block_junks.append(current_junk)

        block_junks.sort(key=lambda junk: junk.sort_key, reverse=self._descending)
        return Library(
            blocks=[block for block_junk in block_junks for block in block_junk.blocks]
        )