Skip to content

Commit 6b0fccc

Browse files
authored
Merge pull request #523 from Police-Data-Accessibility-Project/mc_822_broken_urls
Add logic for propagating broken URLs
2 parents 772ef34 + 968b064 commit 6b0fccc

File tree

20 files changed

+263
-10
lines changed

20 files changed

+263
-10
lines changed

ENV.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ Note that some tasks/subtasks are themselves enabled by other tasks.
7070
| `DELETE_STALE_SCREENSHOTS_TASK_FLAG` | Deletes stale screenshots for URLs already validated. |
7171
| `TASK_CLEANUP_TASK_FLAG` | Cleans up tasks that are no longer needed. |
7272
| `REFRESH_MATERIALIZED_VIEWS_TASK_FLAG` | Refreshes materialized views. |
73+
| `UPDATE_URL_STATUS_TASK_FLAG` | Updates the status of URLs. |
7374
| `DS_APP_SYNC_AGENCY_ADD_TASK_FLAG` | Adds new agencies to the Data Sources App|
7475
| `DS_APP_SYNC_AGENCY_UPDATE_TASK_FLAG` | Updates existing agencies in the Data Sources App|
7576
| `DS_APP_SYNC_AGENCY_DELETE_TASK_FLAG` | Deletes agencies in the Data Sources App|
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
"""Add update_url_status task
2+
3+
Revision ID: 783268bd3daa
4+
Revises: 88ac26c3b025
5+
Create Date: 2025-11-18 09:02:54.985705
6+
7+
"""
8+
from typing import Sequence, Union
9+
10+
from alembic import op
11+
import sqlalchemy as sa
12+
13+
from src.util.alembic_helpers import add_enum_value
14+
15+
# revision identifiers, used by Alembic.
16+
revision: str = '783268bd3daa'
17+
down_revision: Union[str, None] = '88ac26c3b025'
18+
branch_labels: Union[str, Sequence[str], None] = None
19+
depends_on: Union[str, Sequence[str], None] = None
20+
21+
def upgrade() -> None:
    """Apply the migration.

    Extends two existing PostgreSQL enum types:
    - ``url_status`` gains the value ``broken``
    - ``task_type`` gains the value ``Update URL Status``
    """
    # Each pair is (enum type name, value to append); applied in order.
    additions = (
        ("url_status", "broken"),
        ("task_type", "Update URL Status"),
    )
    for enum_name, enum_value in additions:
        add_enum_value(
            enum_name=enum_name,
            enum_value=enum_value,
        )
31+
32+
def downgrade() -> None:
    """Intentional no-op downgrade.

    NOTE(review): presumably left empty because PostgreSQL does not support
    removing a value from an existing enum type without recreating it —
    confirm this matches the project's migration conventions.
    """
    pass

src/api/main.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
from fastapi import FastAPI
77
from pdap_access_manager.access_manager.async_ import AccessManagerAsync
88
from pdap_access_manager.models.auth import AuthInfo
9-
from sqlalchemy.ext.asyncio import create_async_engine
109
from starlette.responses import RedirectResponse
1110

1211
from src.api.endpoints.agencies.routes import agencies_router

src/collectors/enums.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,4 @@ class URLStatus(Enum):
1414
OK = "ok"
1515
ERROR = "error"
1616
DUPLICATE = "duplicate"
17+
BROKEN = "broken"

src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/add/queries/get.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66
from src.core.tasks.scheduled.impl.sync_to_ds.constants import PER_REQUEST_ENTITY_LIMIT
77
from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.add.queries.cte import \
88
DSAppLinkSyncDataSourceAddPrerequisitesCTEContainer
9+
from src.core.tasks.scheduled.impl.sync_to_ds.shared.convert import convert_sm_url_status_to_ds_url_status
910
from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency
1011
from src.db.models.impl.url.core.sqlalchemy import URL
12+
from src.db.models.impl.url.internet_archives.probe.sqlalchemy import URLInternetArchivesProbeMetadata
1113
from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata
1214
from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType
1315
from src.db.queries.base.builder import QueryBuilderBase
@@ -38,6 +40,7 @@ async def run(self, session: AsyncSession) -> AddDataSourcesOuterRequest:
3840
# Required
3941
URL.full_url,
4042
URL.name,
43+
URL.status,
4144
URLRecordType.record_type,
4245
agency_id_cte.c.agency_ids,
4346
# Optional
@@ -56,6 +59,7 @@ async def run(self, session: AsyncSession) -> AddDataSourcesOuterRequest:
5659
URLOptionalDataSourceMetadata.scraper_url,
5760
URLOptionalDataSourceMetadata.access_notes,
5861
URLOptionalDataSourceMetadata.access_types,
62+
URLInternetArchivesProbeMetadata.archive_url,
5963
)
6064
.select_from(
6165
cte.cte
@@ -68,6 +72,10 @@ async def run(self, session: AsyncSession) -> AddDataSourcesOuterRequest:
6872
URLOptionalDataSourceMetadata,
6973
URL.id == URLOptionalDataSourceMetadata.url_id,
7074
)
75+
.outerjoin(
76+
URLInternetArchivesProbeMetadata,
77+
URL.id == URLInternetArchivesProbeMetadata.url_id,
78+
)
7179
.join(
7280
URLRecordType,
7381
URLRecordType.url_id == URL.id,
@@ -110,7 +118,10 @@ async def run(self, session: AsyncSession) -> AddDataSourcesOuterRequest:
110118
scraper_url=mapping[URLOptionalDataSourceMetadata.scraper_url],
111119
access_notes=mapping[URLOptionalDataSourceMetadata.access_notes],
112120
access_types=mapping[URLOptionalDataSourceMetadata.access_types] or [],
113-
url_status=DataSourcesURLStatus.OK
121+
url_status=convert_sm_url_status_to_ds_url_status(
122+
sm_url_status=mapping[URL.status],
123+
),
124+
internet_archives_url=mapping[URLInternetArchivesProbeMetadata.archive_url] or None,
114125
)
115126
)
116127
)

src/core/tasks/scheduled/impl/sync_to_ds/impl/data_sources/update/queries/get.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66
from src.core.tasks.scheduled.impl.sync_to_ds.constants import PER_REQUEST_ENTITY_LIMIT
77
from src.core.tasks.scheduled.impl.sync_to_ds.impl.data_sources.update.queries.cte import \
88
DSAppLinkSyncDataSourceUpdatePrerequisitesCTEContainer
9+
from src.core.tasks.scheduled.impl.sync_to_ds.shared.convert import convert_sm_url_status_to_ds_url_status
910
from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency
1011
from src.db.models.impl.url.core.sqlalchemy import URL
12+
from src.db.models.impl.url.internet_archives.probe.sqlalchemy import URLInternetArchivesProbeMetadata
1113
from src.db.models.impl.url.optional_ds_metadata.sqlalchemy import URLOptionalDataSourceMetadata
1214
from src.db.models.impl.url.record_type.sqlalchemy import URLRecordType
1315
from src.db.queries.base.builder import QueryBuilderBase
@@ -39,6 +41,7 @@ async def run(self, session: AsyncSession) -> UpdateDataSourcesOuterRequest:
3941
# Required
4042
URL.full_url,
4143
URL.name,
44+
URL.status,
4245
URLRecordType.record_type,
4346
agency_id_cte.c.agency_ids,
4447
# Optional
@@ -57,7 +60,8 @@ async def run(self, session: AsyncSession) -> UpdateDataSourcesOuterRequest:
5760
URLOptionalDataSourceMetadata.scraper_url,
5861
URLOptionalDataSourceMetadata.access_notes,
5962
URLOptionalDataSourceMetadata.access_types,
60-
URLOptionalDataSourceMetadata.data_portal_type_other
63+
URLOptionalDataSourceMetadata.data_portal_type_other,
64+
URLInternetArchivesProbeMetadata.archive_url,
6165
)
6266
.select_from(
6367
cte.cte
@@ -70,6 +74,10 @@ async def run(self, session: AsyncSession) -> UpdateDataSourcesOuterRequest:
7074
URLOptionalDataSourceMetadata,
7175
URL.id == URLOptionalDataSourceMetadata.url_id,
7276
)
77+
.outerjoin(
78+
URLInternetArchivesProbeMetadata,
79+
URL.id == URLInternetArchivesProbeMetadata.url_id,
80+
)
7381
.join(
7482
URLRecordType,
7583
URLRecordType.url_id == URL.id,
@@ -113,7 +121,10 @@ async def run(self, session: AsyncSession) -> UpdateDataSourcesOuterRequest:
113121
access_notes=mapping[URLOptionalDataSourceMetadata.access_notes],
114122
access_types=mapping[URLOptionalDataSourceMetadata.access_types] or [],
115123
data_portal_type_other=mapping[URLOptionalDataSourceMetadata.data_portal_type_other],
116-
url_status=DataSourcesURLStatus.OK
124+
url_status=convert_sm_url_status_to_ds_url_status(
125+
sm_url_status=mapping[URL.status],
126+
),
127+
internet_archives_url=mapping[URLInternetArchivesProbeMetadata.archive_url] or None,
117128
)
118129
)
119130
)

src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/add/queries/get.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66
from src.core.tasks.scheduled.impl.sync_to_ds.constants import PER_REQUEST_ENTITY_LIMIT
77
from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.add.queries.cte import \
88
DSAppLinkSyncMetaURLAddPrerequisitesCTEContainer
9+
from src.core.tasks.scheduled.impl.sync_to_ds.shared.convert import convert_sm_url_status_to_ds_url_status
910
from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency
1011
from src.db.models.impl.url.core.sqlalchemy import URL
12+
from src.db.models.impl.url.internet_archives.probe.sqlalchemy import URLInternetArchivesProbeMetadata
1113
from src.db.queries.base.builder import QueryBuilderBase
1214
from src.external.pdap.impl.sync.meta_urls._shared.content import MetaURLSyncContentModel
1315
from src.external.pdap.impl.sync.meta_urls.add.request import AddMetaURLsOuterRequest, AddMetaURLsInnerRequest
@@ -21,7 +23,8 @@ async def run(self, session: AsyncSession) -> AddMetaURLsOuterRequest:
2123
agency_id_cte = (
2224
select(
2325
LinkURLAgency.url_id,
24-
func.array_agg(LinkURLAgency.agency_id).label("agency_ids")
26+
func.array_agg(LinkURLAgency.agency_id).label("agency_ids"),
27+
2528
)
2629
.group_by(
2730
LinkURLAgency.url_id
@@ -33,6 +36,8 @@ async def run(self, session: AsyncSession) -> AddMetaURLsOuterRequest:
3336
select(
3437
cte.url_id,
3538
URL.full_url,
39+
URL.status,
40+
URLInternetArchivesProbeMetadata.archive_url,
3641
agency_id_cte.c.agency_ids
3742
)
3843
.select_from(
@@ -42,6 +47,10 @@ async def run(self, session: AsyncSession) -> AddMetaURLsOuterRequest:
4247
URL,
4348
URL.id == cte.url_id,
4449
)
50+
.outerjoin(
51+
URLInternetArchivesProbeMetadata,
52+
URL.id == URLInternetArchivesProbeMetadata.url_id,
53+
)
4554
.join(
4655
agency_id_cte,
4756
cte.url_id == agency_id_cte.c.url_id
@@ -61,7 +70,11 @@ async def run(self, session: AsyncSession) -> AddMetaURLsOuterRequest:
6170
request_id=mapping[cte.url_id],
6271
content=MetaURLSyncContentModel(
6372
url=mapping["full_url"],
64-
agency_ids=mapping["agency_ids"]
73+
agency_ids=mapping["agency_ids"],
74+
internet_archives_url=mapping[URLInternetArchivesProbeMetadata.archive_url] or None,
75+
url_status=convert_sm_url_status_to_ds_url_status(
76+
sm_url_status=mapping[URL.status],
77+
),
6578
)
6679
)
6780
)

src/core/tasks/scheduled/impl/sync_to_ds/impl/meta_urls/update/queries/get.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66
from src.core.tasks.scheduled.impl.sync_to_ds.constants import PER_REQUEST_ENTITY_LIMIT
77
from src.core.tasks.scheduled.impl.sync_to_ds.impl.meta_urls.update.queries.cte import \
88
DSAppLinkSyncMetaURLUpdatePrerequisitesCTEContainer
9+
from src.core.tasks.scheduled.impl.sync_to_ds.shared.convert import convert_sm_url_status_to_ds_url_status
910
from src.db.models.impl.link.url_agency.sqlalchemy import LinkURLAgency
1011
from src.db.models.impl.url.core.sqlalchemy import URL
12+
from src.db.models.impl.url.internet_archives.probe.sqlalchemy import URLInternetArchivesProbeMetadata
1113
from src.db.queries.base.builder import QueryBuilderBase
1214
from src.external.pdap.impl.sync.meta_urls._shared.content import MetaURLSyncContentModel
1315
from src.external.pdap.impl.sync.meta_urls.update.request import UpdateMetaURLsOuterRequest, UpdateMetaURLsInnerRequest
@@ -33,7 +35,9 @@ async def run(self, session: AsyncSession) -> UpdateMetaURLsOuterRequest:
3335
select(
3436
cte.ds_meta_url_id,
3537
URL.full_url,
36-
agency_id_cte.c.agency_ids
38+
URL.status,
39+
agency_id_cte.c.agency_ids,
40+
URLInternetArchivesProbeMetadata.archive_url,
3741
)
3842
.select_from(
3943
cte.cte
@@ -42,6 +46,10 @@ async def run(self, session: AsyncSession) -> UpdateMetaURLsOuterRequest:
4246
URL,
4347
URL.id == cte.url_id,
4448
)
49+
.outerjoin(
50+
URLInternetArchivesProbeMetadata,
51+
URL.id == URLInternetArchivesProbeMetadata.url_id,
52+
)
4553
.outerjoin(
4654
agency_id_cte,
4755
cte.url_id == agency_id_cte.c.url_id
@@ -61,7 +69,11 @@ async def run(self, session: AsyncSession) -> UpdateMetaURLsOuterRequest:
6169
app_id=mapping[cte.ds_meta_url_id],
6270
content=MetaURLSyncContentModel(
6371
url=mapping['full_url'],
64-
agency_ids=mapping["agency_ids"] or []
72+
agency_ids=mapping["agency_ids"] or [],
73+
internet_archives_url=mapping[URLInternetArchivesProbeMetadata.archive_url] or None,
74+
url_status=convert_sm_url_status_to_ds_url_status(
75+
sm_url_status=mapping[URL.status],
76+
),
6577
)
6678
)
6779
)

src/core/tasks/scheduled/impl/sync_to_ds/shared/__init__.py

Whitespace-only changes.
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
from src.collectors.enums import URLStatus
2+
from src.external.pdap.enums import DataSourcesURLStatus
3+
4+
def convert_sm_url_status_to_ds_url_status(
    sm_url_status: URLStatus
) -> DataSourcesURLStatus:
    """Translate a Source Manager URL status into the Data Sources App status.

    Only ``OK`` and ``BROKEN`` have Data Sources counterparts; any other
    status is rejected.

    Raises:
        ValueError: if *sm_url_status* has no corresponding DS status.
    """
    # Enum members are singletons, so identity checks are equivalent to
    # the value-pattern matching used elsewhere in the codebase.
    if sm_url_status is URLStatus.OK:
        return DataSourcesURLStatus.OK
    if sm_url_status is URLStatus.BROKEN:
        return DataSourcesURLStatus.BROKEN
    raise ValueError(f"URL status has no corresponding DS Status: {sm_url_status}")

0 commit comments

Comments
 (0)