diff --git a/alembic/versions/2025_11_15_1441-de0305465e2c_update_record_formats_and_access_types_.py b/alembic/versions/2025_11_15_1441-de0305465e2c_update_record_formats_and_access_types_.py
new file mode 100644
index 00000000..e9e14ca8
--- /dev/null
+++ b/alembic/versions/2025_11_15_1441-de0305465e2c_update_record_formats_and_access_types_.py
@@ -0,0 +1,92 @@
+"""Update record_formats and access_types to be not null
+
+Revision ID: de0305465e2c
+Revises: a57c3b5b6e93
+Create Date: 2025-11-15 14:41:45.619148
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision: str = 'de0305465e2c'
+down_revision: Union[str, None] = 'a57c3b5b6e93'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+TABLE_NAME = "url_optional_data_source_metadata"
+
+
+def upgrade() -> None:
+    """Backfill NULLs with empty arrays, then make both columns NOT NULL."""
+    # Backfill first: SET NOT NULL fails while any NULL row remains.
+    _update_record_formats()
+    _update_access_types()
+    _alter_record_formats_column()
+    _alter_access_types_column()
+
+
+def _alter_record_formats_column() -> None:
+    """Make record_formats NOT NULL with an empty-array server default."""
+    op.alter_column(
+        table_name=TABLE_NAME,
+        column_name="record_formats",
+        nullable=False,
+        server_default='{}'
+    )
+
+
+def _alter_access_types_column() -> None:
+    """Make access_types NOT NULL with an empty-array server default."""
+    op.alter_column(
+        table_name=TABLE_NAME,
+        column_name="access_types",
+        nullable=False,
+        server_default='{}'
+    )
+
+
+def _update_access_types() -> None:
+    """Replace NULL access_types values with an empty array."""
+    op.execute("""
+        UPDATE url_optional_data_source_metadata
+        SET access_types = '{}'
+        WHERE access_types is null
+
+    """)
+
+
+def _update_record_formats() -> None:
+    """Replace NULL record_formats values with an empty array."""
+    op.execute("""
+        UPDATE url_optional_data_source_metadata
+        SET record_formats = '{}'
+        WHERE record_formats is null
+    """)
+
+
+def _restore_nullable_column(column_name: str) -> None:
+    """Make a column nullable again and drop its server default."""
+    # server_default=None asks Alembic to drop the default.
+    # NOTE(review): assumes neither column had a server default before this
+    # revision -- confirm against the earlier migrations.
+    op.alter_column(
+        table_name=TABLE_NAME,
+        column_name=column_name,
+        nullable=True,
+        server_default=None
+    )
+
+
+def downgrade() -> None:
+    """Revert the schema change made by upgrade().
+
+    Rows backfilled from NULL to '{}' are left untouched: they cannot be told
+    apart from rows that were written as empty arrays.
+    """
+    _restore_nullable_column("record_formats")
+    _restore_nullable_column("access_types")
diff --git a/src/api/endpoints/collector/manual/query.py b/src/api/endpoints/collector/manual/query.py
index dff2cbed..5ebe0e4b 100644
--- a/src/api/endpoints/collector/manual/query.py
+++ b/src/api/endpoints/collector/manual/query.py
@@ -84,9 +84,10 @@ async def run(self, session:
AsyncSession) -> ManualBatchResponseDTO: optional_metadata = URLOptionalDataSourceMetadata( url_id=url.id, - record_formats=entry.record_formats, + record_formats=entry.record_formats or [], data_portal_type=entry.data_portal_type, supplying_entity=entry.supplying_entity, + access_types=[] ) session.add(optional_metadata) url_ids.append(url.id) diff --git a/src/api/endpoints/contributions/user/queries/core.py b/src/api/endpoints/contributions/user/queries/core.py index 57727215..1709776c 100644 --- a/src/api/endpoints/contributions/user/queries/core.py +++ b/src/api/endpoints/contributions/user/queries/core.py @@ -33,15 +33,15 @@ async def run(self, session: AsyncSession) -> ContributionsUserResponse: agency_agree.agreement.label("agency"), url_type_agree.agreement.label("url_type") ) - .join( + .outerjoin( record_type_agree.cte, contributions_cte.user_id == record_type_agree.user_id ) - .join( + .outerjoin( agency_agree.cte, contributions_cte.user_id == agency_agree.user_id ) - .join( + .outerjoin( url_type_agree.cte, contributions_cte.user_id == url_type_agree.user_id ) diff --git a/src/api/endpoints/review/approve/query_/core.py b/src/api/endpoints/review/approve/query_/core.py index ff7a1c1f..b05c6c67 100644 --- a/src/api/endpoints/review/approve/query_/core.py +++ b/src/api/endpoints/review/approve/query_/core.py @@ -66,9 +66,10 @@ async def _optionally_update_optional_metdata(self, url: URL) -> None: optional_metadata = url.optional_data_source_metadata if optional_metadata is None: url.optional_data_source_metadata = URLOptionalDataSourceMetadata( - record_formats=self.approval_info.record_formats, + record_formats=self.approval_info.record_formats or [], data_portal_type=self.approval_info.data_portal_type, - supplying_entity=self.approval_info.supplying_entity + supplying_entity=self.approval_info.supplying_entity, + access_types=[] ) else: update_if_not_none( diff --git a/src/core/tasks/scheduled/loader.py b/src/core/tasks/scheduled/loader.py index 
f104b84f..3ea4fc94 100644 --- a/src/core/tasks/scheduled/loader.py +++ b/src/core/tasks/scheduled/loader.py @@ -127,8 +127,8 @@ async def load_entries(self) -> list[ScheduledTaskEntry]: enabled=self.setup_flag("REFRESH_MATERIALIZED_VIEWS_TASK_FLAG") ), # Sync - ## Agency - ### Add + ## Adds + ### Agency ScheduledTaskEntry( operator=DSAppSyncAgenciesAddTaskOperator( adb_client=self.adb_client, @@ -137,78 +137,79 @@ async def load_entries(self) -> list[ScheduledTaskEntry]: interval_minutes=IntervalEnum.HOURLY.value, enabled=self.setup_flag("DS_APP_SYNC_AGENCY_ADD_TASK_FLAG") ), - ### Update + ### Meta URL ScheduledTaskEntry( - operator=DSAppSyncAgenciesUpdateTaskOperator( + operator=DSAppSyncMetaURLsAddTaskOperator( adb_client=self.adb_client, pdap_client=self.pdap_client ), interval_minutes=IntervalEnum.HOURLY.value, - enabled=self.setup_flag("DS_APP_SYNC_AGENCY_UPDATE_TASK_FLAG") + enabled=self.setup_flag("DS_APP_SYNC_META_URL_ADD_TASK_FLAG") ), - ### Delete + ### Data Source ScheduledTaskEntry( - operator=DSAppSyncAgenciesDeleteTaskOperator( + operator=DSAppSyncDataSourcesAddTaskOperator( adb_client=self.adb_client, pdap_client=self.pdap_client ), interval_minutes=IntervalEnum.HOURLY.value, - enabled=self.setup_flag("DS_APP_SYNC_AGENCY_DELETE_TASK_FLAG") + enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCE_ADD_TASK_FLAG") ), - ## Data Source - ### Add + ## Updates + ### Agency ScheduledTaskEntry( - operator=DSAppSyncDataSourcesAddTaskOperator( + operator=DSAppSyncAgenciesUpdateTaskOperator( adb_client=self.adb_client, pdap_client=self.pdap_client ), interval_minutes=IntervalEnum.HOURLY.value, - enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCE_ADD_TASK_FLAG") + enabled=self.setup_flag("DS_APP_SYNC_AGENCY_UPDATE_TASK_FLAG") ), - ### Update + ### Meta URL ScheduledTaskEntry( - operator=DSAppSyncDataSourcesUpdateTaskOperator( + operator=DSAppSyncMetaURLsUpdateTaskOperator( adb_client=self.adb_client, pdap_client=self.pdap_client ), 
interval_minutes=IntervalEnum.HOURLY.value, - enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCE_UPDATE_TASK_FLAG") + enabled=self.setup_flag("DS_APP_SYNC_META_URL_UPDATE_TASK_FLAG") ), - ### Delete + ### Data Source ScheduledTaskEntry( - operator=DSAppSyncDataSourcesDeleteTaskOperator( + operator=DSAppSyncDataSourcesUpdateTaskOperator( adb_client=self.adb_client, pdap_client=self.pdap_client ), interval_minutes=IntervalEnum.HOURLY.value, - enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCE_DELETE_TASK_FLAG") + enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCE_UPDATE_TASK_FLAG") ), - ## Meta URL - ### Add + ## Deletes + ### Data Source ScheduledTaskEntry( - operator=DSAppSyncMetaURLsAddTaskOperator( + operator=DSAppSyncDataSourcesDeleteTaskOperator( adb_client=self.adb_client, pdap_client=self.pdap_client ), interval_minutes=IntervalEnum.HOURLY.value, - enabled=self.setup_flag("DS_APP_SYNC_META_URL_ADD_TASK_FLAG") + enabled=self.setup_flag("DS_APP_SYNC_DATA_SOURCE_DELETE_TASK_FLAG") ), - ### Update + ### Meta URL ScheduledTaskEntry( - operator=DSAppSyncMetaURLsUpdateTaskOperator( + operator=DSAppSyncMetaURLsDeleteTaskOperator( adb_client=self.adb_client, pdap_client=self.pdap_client ), interval_minutes=IntervalEnum.HOURLY.value, - enabled=self.setup_flag("DS_APP_SYNC_META_URL_UPDATE_TASK_FLAG") + enabled=self.setup_flag("DS_APP_SYNC_META_URL_DELETE_TASK_FLAG") ), - ### Delete + ### Agency ScheduledTaskEntry( - operator=DSAppSyncMetaURLsDeleteTaskOperator( + operator=DSAppSyncAgenciesDeleteTaskOperator( adb_client=self.adb_client, pdap_client=self.pdap_client ), interval_minutes=IntervalEnum.HOURLY.value, - enabled=self.setup_flag("DS_APP_SYNC_META_URL_DELETE_TASK_FLAG") - ) + enabled=self.setup_flag("DS_APP_SYNC_AGENCY_DELETE_TASK_FLAG") + ), + ] diff --git a/src/db/client/async_.py b/src/db/client/async_.py index 95bc7082..50802347 100644 --- a/src/db/client/async_.py +++ b/src/db/client/async_.py @@ -371,9 +371,10 @@ async def add_miscellaneous_metadata(self, session: 
AsyncSession, tdos: list[URL for tdo in tdos: metadata_object = URLOptionalDataSourceMetadata( url_id=tdo.url_id, - record_formats=tdo.record_formats, + record_formats=tdo.record_formats or [], data_portal_type=tdo.data_portal_type, - supplying_entity=tdo.supplying_entity + supplying_entity=tdo.supplying_entity, + access_types=[], ) session.add(metadata_object) diff --git a/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py b/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py index 4661be7a..32156a38 100644 --- a/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py +++ b/src/db/models/impl/url/optional_ds_metadata/sqlalchemy.py @@ -15,7 +15,7 @@ class URLOptionalDataSourceMetadata( ): __tablename__ = 'url_optional_data_source_metadata' - record_formats = Column(ARRAY(String), nullable=True) + record_formats = Column(ARRAY(String), nullable=False, default=[]) data_portal_type = Column(String, nullable=True) supplying_entity = Column(String, nullable=True) coverage_start = Column(Date, nullable=True) @@ -38,7 +38,7 @@ class URLOptionalDataSourceMetadata( native_enum=True, values_callable=lambda AccessTypeEnum: [e.value for e in AccessTypeEnum] ) - ), nullable=True) + ), nullable=False, default=[]) data_portal_type_other = Column(String, nullable=True) # Relationships diff --git a/tests/automated/integration/api/test_manual_batch.py b/tests/automated/integration/api/test_manual_batch.py index 9e52d358..fa3f7884 100644 --- a/tests/automated/integration/api/test_manual_batch.py +++ b/tests/automated/integration/api/test_manual_batch.py @@ -121,7 +121,10 @@ def check_url(url: URL, url_only: bool): def check_opt_metadata(metadata: URLOptionalDataSourceMetadata, no_optional: bool): assert metadata.url_id is not None - other_attributes = ["record_formats", "data_portal_type", "supplying_entity"] + other_attributes = [ + "data_portal_type", + "supplying_entity" + ] return check_attributes(metadata, other_attributes, no_optional) # Confirm 50 have nothing 
but URL id diff --git a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_add.py b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_add.py index 060637db..b90bb761 100644 --- a/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_add.py +++ b/tests/automated/integration/tasks/scheduled/impl/sync_to_ds/data_source/test_add.py @@ -5,6 +5,7 @@ from src.db.client.async_ import AsyncDatabaseClient from src.db.models.impl.url.data_source.sqlalchemy import DSAppLinkDataSource from src.external.pdap.client import PDAPClient +from src.external.pdap.enums import DataSourcesURLStatus from src.external.pdap.impl.sync.data_sources._shared.content import DataSourceSyncContentModel from src.external.pdap.impl.sync.data_sources.add.request import AddDataSourcesOuterRequest, AddDataSourcesInnerRequest from src.external.pdap.impl.sync.shared.models.add.response import DSAppSyncAddResponseModel, \ @@ -78,7 +79,7 @@ async def test_add( assert content.access_notes is None assert content.access_types is None assert content.data_portal_type_other is None - assert content.url_status is None + assert content.url_status == DataSourcesURLStatus.OK assert content.agency_ids == [test_agency_id] diff --git a/tests/automated/integration/tasks/url/impl/test_url_miscellaneous_metadata_task.py b/tests/automated/integration/tasks/url/impl/test_url_miscellaneous_metadata_task.py index 93878562..bc3f240d 100644 --- a/tests/automated/integration/tasks/url/impl/test_url_miscellaneous_metadata_task.py +++ b/tests/automated/integration/tasks/url/impl/test_url_miscellaneous_metadata_task.py @@ -122,12 +122,12 @@ async def test_url_miscellaneous_metadata_task(db_data_creator: DBDataCreator): assert url.description == expected_description, f"For url.id {url.id}, expected description {expected_description}, got {url.description}" expected_urls = { - common_crawler_url_id: (None, None, None), - auto_googler_url_id: (None, None, 
None), + common_crawler_url_id: ([], None, None), + auto_googler_url_id: ([], None, None), ckan_url_id: (["CSV", "JSON"], "Test Data Portal Type", "Test Supplying Entity"), - muckrock_simple_url_id: (None, None, None), - muckrock_county_url_id: (None, None, None), - muckrock_all_url_id: (None, None, None), + muckrock_simple_url_id: ([], None, None), + muckrock_county_url_id: ([], None, None), + muckrock_all_url_id: ([], None, None), } metadatas: list[URLOptionalDataSourceMetadata] = await db_data_creator.adb_client.get_all(URLOptionalDataSourceMetadata)