Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ absolute_sources = $(shell echo $(project_root)/src \
$$(find $(project_root)/terraform{,/gitlab,/shared,/browser} \
$(project_root)/lambdas/{indexer,service}{,/.chalice} \
$(project_root)/.github \
$(project_root)/resources \
-maxdepth 1 \
-name '*.template.py' \
-type f ))
Expand Down
6 changes: 3 additions & 3 deletions docs/mirror.rst
Original file line number Diff line number Diff line change
Expand Up @@ -160,9 +160,9 @@ response from that endpoint contains the file's mirror URI at
``hits[].files[].azul_mirror_uri``. The mirror URI is of the form
``s3://${bucket}/file/${digest_value}.${digest_type}`` where ``digest_type`` and
``digest_value`` denote the primary digest of the file. If the property
``azul_mirror_uri`` is absent from the Azul response, the mirror will not
include that file. If the response property is present, the mirror will very
likely include the file.
``azul_mirror_uri`` is null in the Azul response, the mirror will not include
that file. If the response property is non-null, the mirror will very likely
include the file.

.. [8] https://service.azul.data.humancellatlas.org/
.. [9] https://service.explore.anvilproject.org/
Expand Down
8 changes: 0 additions & 8 deletions lambdas/indexer/vendor/resources/environ.json.template.py

This file was deleted.

2 changes: 1 addition & 1 deletion lambdas/indexer/vendor/resources/static/schemas
2 changes: 1 addition & 1 deletion lambdas/indexer/vendor/resources/static/swagger
5 changes: 4 additions & 1 deletion lambdas/lambdas.mk
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ config: .chalice/config.json
.PHONY: environ
environ: vendor/resources/environ.json

.PHONY: sources
sources: vendor/resources/sources.json

.PHONY: local
local: check_python config
chalice local
Expand All @@ -52,7 +55,7 @@ local: check_python config
clean: git_clean_recursive

.PHONY: package
package: check_branch check_python check_aws config environ compile
package: check_branch check_python check_aws config environ sources compile
chalice package --stage $(AZUL_DEPLOYMENT_STAGE) --pkg-format terraform .chalice/terraform

.PHONY: openapi
Expand Down
8 changes: 0 additions & 8 deletions lambdas/service/vendor/resources/environ.json.template.py

This file was deleted.

2 changes: 1 addition & 1 deletion lambdas/service/vendor/resources/static/swagger
8 changes: 8 additions & 0 deletions resources/environ.json.template.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from azul import (
config,
)
from azul.template import (
emit,
)

emit(config.lambda_env_for_outsourcing)
8 changes: 8 additions & 0 deletions resources/sources.json.template.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from azul.service.source_service import (
SourceService,
)
from azul.template import (
emit,
)

emit(SourceService().configured_sources_for_outsourcing)
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion scripts/mirror.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def mirror_catalog(azul: AzulClient,
fail_queue)
public_sources_by_spec = {
source.spec: source
for source in plugin.list_sources(authentication=None)
for source in plugin.list_accessible_sources(authentication=None)
}
# When the user doesn't specify a source or provides "*" as a source glob,
# we implicitly filter out managed-access sources. This lets us assert that
Expand Down
2 changes: 1 addition & 1 deletion scripts/update_swagger.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
'swagger-initializer.js'
]

swagger_dir = Path(config.project_root) / 'swagger'
swagger_dir = Path(config.project_root) / 'resources/static/swagger'


def download_file(name: str):
Expand Down
7 changes: 5 additions & 2 deletions src/azul/indexer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -423,14 +423,15 @@ class SourceSpec(Parseable, metaclass=ABCMeta):
are structured might want to implement this abstract class. Plugins that
have simple unstructured names may want to use :class:`SimpleSourceSpec`.
"""
#: Assumed to be unique per catalog.
name: str


@attrs.frozen(kw_only=True)
class SimpleSourceSpec(SourceSpec):
"""
Default implementation for unstructured source names.
"""
name: str

@classmethod
def parse(cls, spec: str) -> Self:
Expand Down Expand Up @@ -465,7 +466,9 @@ class SourceRef[SOURCE_SPEC: SourceSpec](

Note to plugin implementers: Since the source ID can't be assumed to be
globally unique, plugins should subclass this class, even if the subclass
body is empty.
body is empty. Additionally, subclasses must not add any fields that are
required by the constructor, since the base repository plugin needs to be
able to instantiate them generically.

>>> spec = SimpleSourceSpec(name='')
>>> prefix = Prefix(partition=0)
Expand Down
38 changes: 22 additions & 16 deletions src/azul/indexer/mirror_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,9 +333,9 @@ class BaseMirrorService:
def _queues(self) -> Queues:
return Queues()

@cached_property
def _repository_plugin(self) -> RepositoryPlugin:
return RepositoryPlugin.load(self.catalog).create(self.catalog)
@property
def repository_plugin(self) -> RepositoryPlugin:
return self._source_service.repository_plugin(self.catalog)

@cached_property
def _storage(self) -> StorageService:
Expand All @@ -344,6 +344,10 @@ def _storage(self) -> StorageService:
bucket = aws.mirror_bucket
return StorageService(bucket)

@cached_property
def _source_service(self) -> SourceService:
return SourceService()

def may_mirror_files_from_source(self, source_spec: SourceSpec) -> bool:
"""
Test whether it makes sense to request the mirroring of files from the
Expand All @@ -352,9 +356,16 @@ def may_mirror_files_from_source(self, source_spec: SourceSpec) -> bool:
definitely refuse to mirror all files from the source.
"""
if self.may_mirror():
plugin = self._repository_plugin
plugin = self.repository_plugin
source_config = plugin.sources[source_spec]
return source_config.mirror
if source_config.mirror:
is_public = any(
source_spec == source.spec
for source in self._source_service.configured_public_sources
)
return is_public
else:
return False
else:
return False

Expand Down Expand Up @@ -519,10 +530,6 @@ class MirrorService(BaseMirrorService, HasCachedHttpClient):

_schema_url_func: SchemaUrlFunc

@cached_property
def _source_service(self) -> SourceService:
return SourceService()

# We don't store the mirrored files' actual content type(s) in S3's
# `Content-Type` metadata because a single file object may store the
# contents of multiple file metadata entities, which may declare different
Expand All @@ -547,9 +554,11 @@ def _mirror(self, a: MirrorAction):

@_mirror.register
def _(self, a: MirrorSourceAction) -> Iterator[MirrorAction]:
assert a.source.id in self._list_public_source_ids(), R(
public_sources = self._source_service.list_accessible_source_ids(self.catalog,
authentication=None)
assert a.source.id in public_sources, R(
'Cannot mirror non-public source', a.source)
plugin = self._repository_plugin
plugin = self.repository_plugin
# The desired partition size depends on the maximum number of messages
# we can send in one Lambda invocation, because queueing the individual
# mirror_file messages turns out to dominate the running time of
Expand All @@ -572,12 +581,9 @@ def _(self, a: MirrorSourceAction) -> Iterator[MirrorAction]:
for partition in prefix.partition_prefixes():
yield devolve(MirrorPartitionAction, a, source=source, prefix=partition)

def _list_public_source_ids(self) -> set[str]:
return self._source_service.list_source_ids(self.catalog, authentication=None)

@_mirror.register
def _(self, a: MirrorPartitionAction) -> Iterator[MirrorAction]:
plugin = self._repository_plugin
plugin = self.repository_plugin
files = plugin.list_files(a.source, a.prefix)
for file in files:
assert file.size is not None, R('File size unknown', file)
Expand Down Expand Up @@ -723,7 +729,7 @@ def _repository_url(self, file: File) -> furl:
'Only TDR catalogs are supported', self.catalog)
assert file.drs_uri is not None, R(
'File cannot be downloaded', file)
object = self._repository_plugin.drs_object(file.drs_uri)
object = self.repository_plugin.drs_object(file.drs_uri)
access = object.get(AccessMethod.gs)
assert access.method is AccessMethod.https, access
return furl(access.url)
Expand Down
45 changes: 37 additions & 8 deletions src/azul/plugins/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -653,10 +653,31 @@ def _assert_partition(self, source: SOURCE_REF, prefix: str):
assert source.prefix is not None, source
assert prefix in source.prefix, (source, prefix)

def _match_sources(self,
source_names_by_id: Mapping[str, str]
) -> list[SOURCE_REF]:
"""
Filter the given sources to only include sources that the plugin is
configured to read metadata from, and instantiate them as `SourceRef`s.
"""
configured_specs_by_name = {spec.name: spec for spec in self.sources}
source_ids_by_name = {
name: id
for id, name in source_names_by_id.items()
if name in configured_specs_by_name
}
source_ref_cls = self.source_ref_cls
return [
source_ref_cls(id=id,
spec=configured_specs_by_name[name],
prefix=None)
for name, id in source_ids_by_name.items()
]

@abstractmethod
def list_sources(self,
authentication: Authentication | None
) -> Iterable[SOURCE_REF]:
def list_accessible_sources(self,
authentication: Authentication | None
) -> Iterable[SOURCE_REF]:
"""
The sources the plugin is configured to read metadata from that are
accessible using the provided authentication. Retrieving this
Expand All @@ -666,20 +687,28 @@ def list_sources(self,
"""
raise NotImplementedError

def list_source_ids(self,
authentication: Authentication | None
) -> set[str]:
@abstractmethod
def list_sources(self) -> Iterable[SOURCE_REF]:
"""
The sources the plugin is configured to read metadata from. Retrieving
this information may require a round-trip to the underlying repository.
"""
raise NotImplementedError

def list_accessible_source_ids(self,
authentication: Authentication | None
) -> set[str]:
"""
List source IDs in the underlying repository that are accessible using
the provided authentication. Sources may be included even if they are
not configured to be read from. Subclasses should override this method
if it can be implemented more efficiently than `list_sources`.
if it can be implemented more efficiently than `list_accessible_sources`.

Retrieving this information may require a round-trip to the underlying
repository. Implementations should raise PermissionError if the provided
authentication is insufficient to access the repository.
"""
return {source.id for source in self.list_sources(authentication)}
return {source.id for source in self.list_accessible_sources(authentication)}

@cached_property
def _generic_params(self) -> dict[TypeVar, type]:
Expand Down
9 changes: 6 additions & 3 deletions src/azul/plugins/repository/canned/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,12 @@ class Plugin(RepositoryPlugin[
],
HasCachedHttpClient):

def list_sources(self,
authentication: Authentication | None
) -> list[CannedSourceRef]:
def list_accessible_sources(self,
authentication: Authentication | None
) -> list[CannedSourceRef]:
return self.list_sources()

def list_sources(self) -> list[CannedSourceRef]:
return [
CannedSourceRef(id=self._lookup_source_id(spec), spec=spec, prefix=None)
for spec in self.sources
Expand Down
9 changes: 6 additions & 3 deletions src/azul/plugins/repository/dss/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,9 +114,12 @@ def count_bundles(self, source: DSSSourceRef) -> NoReturn:
def count_files(self, source: DSSSourceRef) -> NoReturn:
assert False, 'DSS is EOL'

def list_sources(self,
authentication: Authentication | None
) -> list[DSSSourceRef]:
def list_accessible_sources(self,
authentication: Authentication | None
) -> list[DSSSourceRef]:
return self.list_sources()

def list_sources(self) -> list[DSSSourceRef]:
return [
DSSSourceRef(id=self._lookup_source_id(spec), spec=spec, prefix=None)
for spec in self.sources
Expand Down
Loading
Loading