Skip to content

Added parse_url() entrypoint #493

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions bibtexparser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import bibtexparser.model
from bibtexparser.entrypoint import parse_file
from bibtexparser.entrypoint import parse_string
from bibtexparser.entrypoint import parse_url
from bibtexparser.entrypoint import write_file
from bibtexparser.entrypoint import write_string
from bibtexparser.library import Library
Expand Down
27 changes: 27 additions & 0 deletions bibtexparser/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,34 @@ def parse_file(
bibtex_str, parse_stack=parse_stack, append_middleware=append_middleware
)

def parse_url(
    url: str,
    parse_stack: Optional[Iterable[Middleware]] = None,
    append_middleware: Optional[Iterable[Middleware]] = None,
    encoding: str = "UTF-8",
) -> Library:
    """Fetch a BibTeX document from a URL and parse it.

    The resource is opened with ``urllib.request.urlopen``, read completely,
    decoded using ``encoding`` and then handed to :func:`parse_string`.

    :param url: Location of the BibTeX file; passed unchanged to ``urlopen``.
    :param parse_stack:
        List of middleware to apply to the database after splitting.
        If ``None`` (default), a default stack will be used providing simple
        standard functionality.

    :param append_middleware:
        List of middleware to append to the default stack
        (ignored if a not-``None`` parse_stack is passed).

    :param encoding: Encoding used to decode the downloaded bytes.
        Default encoding is ``"UTF-8"``.
    :return: Library: Parsed BibTeX library
    """
    # Imported lazily so the networking module is only loaded when needed.
    import urllib.request

    with urllib.request.urlopen(url) as response:
        raw_bytes = response.read()
        decoded_text = raw_bytes.decode(encoding)

    return parse_string(
        decoded_text,
        parse_stack=parse_stack,
        append_middleware=append_middleware,
    )

def write_file(
file: Union[str, TextIO],
library: Library,
Expand Down
48 changes: 48 additions & 0 deletions bibtexparser/library.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@
from .model import DuplicateBlockKeyBlock
from .model import Entry
from .model import ExplicitComment
from .model import Field
from .model import ImplicitComment
from .model import ParsingFailedBlock
from .model import Preamble
from .model import String


# TODO Use functools.lru_cache for library properties (which create lists when called)


Expand Down Expand Up @@ -162,6 +164,11 @@ def failed_blocks(self) -> List[ParsingFailedBlock]:
"""All blocks that could not be parsed, preserving order of insertion."""
return [b for b in self._blocks if isinstance(b, ParsingFailedBlock)]

@property
def duplicate_blocks(self) -> List[DuplicateBlockKeyBlock]:
    """All ``DuplicateBlockKeyBlock`` instances held by the library.

    The blocks are returned in the same order in which they are stored
    in the library's internal block list.
    """
    return [
        candidate
        for candidate in self._blocks
        if isinstance(candidate, DuplicateBlockKeyBlock)
    ]

@property
def strings(self) -> List[String]:
"""All @string blocks in the library, preserving order of insertion."""
Expand Down Expand Up @@ -195,3 +202,44 @@ def comments(self) -> List[Union[ExplicitComment, ImplicitComment]]:
return [
block for block in self._blocks if isinstance(block, (ExplicitComment, ImplicitComment))
]

def filter(self, filter: Dict, case_sensitive=False) -> List[Entry]:
    """Return the entries whose fields satisfy every criterion in ``filter``.

    An entry matches when, for *each* key of ``filter``, the entry has a
    ``Field`` under that key and at least one of the field's values equals
    one of the accepted values for that key. An empty ``filter`` therefore
    matches every entry.

    :param filter: Mapping from field key to the accepted value, or to a
        collection (list, tuple, set or frozenset) of accepted values.
        The mapping is only read; it is never modified.
        NOTE: the parameter name shadows the ``filter`` builtin inside this
        method; it is kept for backward compatibility with keyword callers.
    :param case_sensitive: If ``False`` (default), string values are
        compared case-insensitively (both sides are lower-cased);
        non-string values are always compared as-is.
    :return: The matching entries, in the order of the library's blocks.
    """

    def _as_value_set(value) -> set:
        """Normalise a scalar or a collection into a set of comparable values."""
        if isinstance(value, (list, tuple, set, frozenset)):
            items = value
        else:
            items = [value]
        if case_sensitive:
            return set(items)
        return {item.lower() if isinstance(item, str) else item for item in items}

    # Build a private, normalised copy of the criteria so that the caller's
    # dict is left untouched and every accepted value (including values that
    # were passed as a set) receives the same case handling.
    criteria = {key: _as_value_set(accepted) for key, accepted in filter.items()}

    matches = []
    for block in self._blocks:
        if not isinstance(block, Entry):
            continue
        fields = block.fields_dict
        for key, accepted in criteria.items():
            field = fields[key] if key in fields else None
            if not isinstance(field, Field):
                # Field is missing (or not a regular Field): no match.
                break
            if not _as_value_set(field.value) & accepted:
                # None of the field's values is among the accepted ones.
                break
        else:
            # No criterion rejected the entry.
            matches.append(block)

    return matches
2 changes: 2 additions & 0 deletions bibtexparser/middlewares/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from bibtexparser.middlewares.interpolate import ResolveStringReferencesMiddleware
from bibtexparser.middlewares.latex_encoding import LatexDecodingMiddleware
from bibtexparser.middlewares.latex_encoding import LatexEncodingMiddleware
from bibtexparser.middlewares.lists import SeparateCSVLists
from bibtexparser.middlewares.middleware import BlockMiddleware
from bibtexparser.middlewares.middleware import LibraryMiddleware
from bibtexparser.middlewares.month import MonthAbbreviationMiddleware
Expand All @@ -15,6 +16,7 @@
from bibtexparser.middlewares.names import SeparateCoAuthors
from bibtexparser.middlewares.names import SplitNameParts
from bibtexparser.middlewares.sorting_blocks import SortBlocksByTypeAndKeyMiddleware
from bibtexparser.middlewares.sorting_blocks import SortBlocksByYearMonthDayMiddleware
from bibtexparser.middlewares.sorting_entry_fields import SortFieldsAlphabeticallyMiddleware
from bibtexparser.middlewares.sorting_entry_fields import SortFieldsCustomMiddleware

Expand Down
61 changes: 61 additions & 0 deletions bibtexparser/middlewares/lists.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import abc
from typing import List, Literal, Tuple

from bibtexparser.model import Block, Entry, Field

from .middleware import BlockMiddleware

class _ListTransformerMiddleware(BlockMiddleware, abc.ABC):
    """Internal utility class - superclass for middlewares transforming list-like field values.

    Subclasses implement ``_transform_field_value``; this class applies it to
    the value of every entry field whose key is listed in ``field_names``.

    :param allow_inplace_modification: See corresponding property.
    :param field_names: Keys of the fields whose values are transformed by
        this middleware. A single string is treated as one field name."""

    def __init__(
        self,
        allow_inplace_modification: bool = True,
        field_names: Tuple[str, ...] = (),
    ):
        super().__init__(
            allow_inplace_modification=allow_inplace_modification,
            allow_parallel_execution=True,
        )
        # A bare string would turn the ``in`` test in ``transform_entry`` into
        # a substring check (e.g. "key" in "keywords"); wrap it in a tuple.
        if isinstance(field_names, str):
            field_names = (field_names,)
        self._field_names = field_names

    @property
    def field_names(self) -> Tuple[str, ...]:
        """Keys of the fields whose values are transformed by this middleware."""
        return self._field_names

    @abc.abstractmethod
    def _transform_field_value(self, value):
        """Return the transformed replacement for a single field value."""
        raise NotImplementedError("called abstract method")

    # docstr-coverage: inherited
    def transform_entry(self, entry: Entry, *args, **kwargs) -> Block:
        field: Field

        for field in entry.fields:
            if field.key in self.field_names:
                # Replace the value on the field object itself.
                field.value = self._transform_field_value(field.value)
        return entry


def split_comma_separated_list(string: str) -> List[str]:
    """Split a string of comma-separated values into a list of trimmed items.

    Whitespace surrounding every item is removed, including whitespace at the
    very start and end of ``string``. Empty items are preserved, so
    ``"a,,b"`` yields ``["a", "", "b"]`` and ``""`` yields ``[""]``.

    :param string: The comma-separated text to split.
    :return: The individual items, in their original order.
    """
    return [item.strip() for item in string.split(",")]


class SeparateCSVLists(_ListTransformerMiddleware):
    """Middleware replacing comma-separated field values by lists of strings.

    The splitting itself is delegated to ``split_comma_separated_list``."""

    # docstr-coverage: inherited
    @classmethod
    def metadata_key(cls) -> str:
        key = "separate_lists"
        return key

    # docstr-coverage: inherited
    def _transform_field_value(self, string) -> List[str]:
        separated_items = split_comma_separated_list(string)
        return separated_items
93 changes: 93 additions & 0 deletions bibtexparser/middlewares/sorting_blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,3 +120,96 @@ def _sort_key(block: Block):

blocks.sort(key=_sort_key)
return Library(blocks=blocks)


class SortBlocksByYearMonthDayMiddleware(LibraryMiddleware):
    """Sorts the blocks of a library by year, month and day.

    Blocks without a usable ``year``, ``month`` or ``day`` value use ``0`` for
    the corresponding component of the sort key. Months are expected to be
    integers (it is recommended to apply ``MonthIntMiddleware`` beforehand).

    :param preserve_comments_on_top: comments remain above same block (default True)
    :param descending: uses descending ordering (ascending by default)
    """

    def __init__(self, preserve_comments_on_top: bool = True, descending: bool = False):
        self._preserve_comments_on_top = preserve_comments_on_top
        self._descending = descending

        # In-place modification is not yet supported, we make this explicit here,
        super().__init__(allow_inplace_modification=False)

    @staticmethod
    def _sort_key(block: Block):
        """Return the ``(year, month, day)`` sort key of ``block``.

        Each component is ``0`` when the block has no ``fields_dict``
        (e.g. comments), when the field is absent, when its value cannot be
        converted to an integer, or - for month and day - when the integer
        lies outside ``1..12`` / ``1..31``.
        """

        def _int_field(key, lower=None, upper=None) -> int:
            try:
                raw = block.fields_dict[key].value
            except (AttributeError, KeyError):
                # No fields_dict (e.g. comments) or no such field.
                return 0
            try:
                number = int(raw)
            except (TypeError, ValueError):
                # Not numeric (e.g. month "jan", year "in press").
                return 0
            if lower is not None and number < lower:
                return 0
            if upper is not None and number > upper:
                return 0
            return number

        return _int_field("year"), _int_field("month", 1, 12), _int_field("day", 1, 31)

    # docstr-coverage: inherited
    def transform(self, library: Library) -> Library:
        # Work on copies: this middleware does not modify the input library.
        blocks = deepcopy(library.blocks)

        if not self._preserve_comments_on_top:
            blocks.sort(key=self._sort_key, reverse=self._descending)
            return Library(blocks=blocks)

        # Group the blocks into junks: every junk holds the comments directly
        # preceding a non-comment block, followed by that block.
        block_junks = []
        current_junk = _BlockJunk()
        for block in blocks:
            current_junk.blocks.append(block)
            # Overwritten on every append, so the junk ends up with the key of
            # its last block, i.e. the non-comment block that closes it.
            current_junk.sort_key = self._sort_key(block)

            if not isinstance(block, (ExplicitComment, ImplicitComment)):
                # We added a non-comment block, hence we finish the junk and
                # start a new one
                block_junks.append(current_junk)
                current_junk = _BlockJunk()

        if current_junk.blocks:
            # That would be a junk with only comments, but we add it at the end for completeness
            block_junks.append(current_junk)

        block_junks.sort(key=lambda junk: junk.sort_key, reverse=self._descending)
        return Library(
            blocks=[block for block_junk in block_junks for block in block_junk.blocks]
        )