From bd4780fe60c5b166c7e8f2323eb371c0f0c4a0fe Mon Sep 17 00:00:00 2001 From: Adam Harvey Date: Fri, 13 Dec 2024 14:22:10 -0800 Subject: [PATCH] db-dump: set sequence values when importing a database dump By default, the import script recreates the database schema, which includes creating new sequences with zero values. This results in the lazy crates.io developer occasionally receiving obscure errors when inserting records into tables that use sequences, often not on the first or second insert due to IDs in the database dump not always being continuous. Rather than dumping the real sequence values from the database, we can just recreate them based on the maximum ID in each table. Works well enough, and means we don't have to tinker with the export script or ship extra data. This commit only configures the database tables that actually include data in the database dump. There are other sequences, but since those tables won't have data imported, it doesn't matter if they remain zero after import. --- .../src/configuration.rs | 13 ++++++++++-- .../crates_io_database_dump/src/dump-db.toml | 21 +++++++++++++++++++ .../src/dump-import.sql.j2 | 13 ++++++++++++ .../src/gen_scripts.rs | 4 +++- 4 files changed, 48 insertions(+), 3 deletions(-) diff --git a/crates/crates_io_database_dump/src/configuration.rs b/crates/crates_io_database_dump/src/configuration.rs index 6c11b8eddd2..8e310e7fb1e 100644 --- a/crates/crates_io_database_dump/src/configuration.rs +++ b/crates/crates_io_database_dump/src/configuration.rs @@ -1,4 +1,4 @@ -use serde::Deserialize; +use serde::{Deserialize, Serialize}; use std::collections::{BTreeMap, VecDeque}; /// An enum indicating whether a column is included in the database dumps. @@ -15,7 +15,9 @@ pub enum ColumnVisibility { /// and should list all tables the current tables refers to with foreign key /// constraints on public columns. The `filter` field is a valid SQL expression /// used in a `WHERE` clause to filter the rows of the table. 
The `columns` -/// field maps column names to their respective visibilities. +/// field maps column names to their respective visibilities. The `sequence` +/// field, if present, defines the sequence used by the table when generating +/// IDs, along with the ID column. #[derive(Clone, Debug, Default, Deserialize)] pub struct TableConfig { #[serde(default)] @@ -24,6 +26,13 @@ pub struct TableConfig { pub columns: BTreeMap<String, ColumnVisibility>, #[serde(default)] pub column_defaults: BTreeMap<String, String>, + pub sequence: Option<SequenceConfig>, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub struct SequenceConfig { + pub column: String, + pub name: String, } /// Maps table names to the respective configurations. Used to load `dump_db.toml`. diff --git a/crates/crates_io_database_dump/src/dump-db.toml b/crates/crates_io_database_dump/src/dump-db.toml index 90d3969cf5d..c14ed2b930e 100644 --- a/crates/crates_io_database_dump/src/dump-db.toml +++ b/crates/crates_io_database_dump/src/dump-db.toml @@ -48,6 +48,9 @@ description = "public" crates_cnt = "public" created_at = "public" path = "public" +[categories.sequence] +column = "id" +name = "categories_id_seq" [crate_downloads.columns] crate_id = "public" @@ -87,6 +90,9 @@ textsearchable_index_col = "private" # This Postgres specific and can be derived repository = "public" max_upload_size = "public" max_features = "public" +[crates.sequence] +column = "id" +name = "packages_id_seq" [crates_categories] dependencies = ["categories", "crates"] @@ -130,6 +136,9 @@ features = "public" target = "public" kind = "public" explicit_name = "public" +[dependencies.sequence] +column = "id" +name = "dependencies_id_seq" [__diesel_schema_migrations.columns] version = "private" @@ -152,6 +161,9 @@ id = "public" keyword = "public" crates_cnt = "public" created_at = "public" +[keywords.sequence] +column = "id" +name = "keywords_id_seq" [metadata.columns] total_downloads = "public" @@ -186,6 +198,9 @@ github_id = "public" name = "public" avatar = "public" org_id = 
"public" +[teams.sequence] +column = "id" +name = "teams_id_seq" [users] filter = """ @@ -207,6 +222,9 @@ is_admin = "private" publish_notifications = "private" [users.column_defaults] gh_access_token = "''" +[users.sequence] +column = "id" +name = "users_id_seq" [version_downloads] dependencies = ["versions"] @@ -253,6 +271,9 @@ documentation = "public" repository = "public" categories = "public" keywords = "public" +[versions.sequence] +column = "id" +name = "versions_id_seq" [versions_published_by.columns] version_id = "private" diff --git a/crates/crates_io_database_dump/src/dump-import.sql.j2 b/crates/crates_io_database_dump/src/dump-import.sql.j2 index ad79cb69559..6b451fdcbc1 100644 --- a/crates/crates_io_database_dump/src/dump-import.sql.j2 +++ b/crates/crates_io_database_dump/src/dump-import.sql.j2 @@ -29,6 +29,19 @@ BEGIN; {% for cd in table.column_defaults %} ALTER TABLE "{{table.name}}" ALTER COLUMN "{{cd.column}}" DROP DEFAULT; {%- endfor %} +{%- endfor %} + + -- Set sequence values. +{% for table in tables -%} +{% if table.sequence %} + SELECT setval( + '{{table.sequence.name}}', + COALESCE( + (SELECT MAX("{{table.sequence.column}}") FROM "{{table.name}}")::BIGINT, + 1 + ) + ); +{% endif %} {%- endfor %} -- Reenable triggers on each table. 
diff --git a/crates/crates_io_database_dump/src/gen_scripts.rs b/crates/crates_io_database_dump/src/gen_scripts.rs index 3d6224eb974..e1f3383ec9c 100644 --- a/crates/crates_io_database_dump/src/gen_scripts.rs +++ b/crates/crates_io_database_dump/src/gen_scripts.rs @@ -1,4 +1,4 @@ -use crate::configuration::{ColumnVisibility, TableConfig, VisibilityConfig}; +use crate::configuration::{ColumnVisibility, SequenceConfig, TableConfig, VisibilityConfig}; use anyhow::Context; use serde::Serialize; use std::{fs::File, path::Path}; @@ -18,6 +18,7 @@ struct HandlebarsTableContext<'a> { filter: Option, columns: String, column_defaults: Vec>, + sequence: Option<&'a SequenceConfig>, } #[derive(Debug, Serialize)] @@ -52,6 +53,7 @@ impl TableConfig { filter, columns, column_defaults, + sequence: self.sequence.as_ref(), }) } }