Skip to content

Commit 8b09958

Browse files
authored
[stage1] fix: strip YAML frontmatter from TOML integration prompts (#2096)
* fix: correct toml integration frontmatter handling
* refactor: reuse frontmatter split in toml integration
* fix: preserve toml integration string semantics
* docs: align toml integration renderer docstring
1 parent 9c0be46 commit 8b09958

File tree

2 files changed

+154
-46
lines changed

2 files changed

+154
-46
lines changed

src/specify_cli/integrations/base.py

Lines changed: 82 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -532,23 +532,83 @@ def command_filename(self, template_name: str) -> str:
532532
def _extract_description(content: str) -> str:
533533
"""Extract the ``description`` value from YAML frontmatter.
534534
535-
Scans lines between the first pair of ``---`` delimiters for a
536-
top-level ``description:`` key. Returns the value (with
537-
surrounding quotes stripped) or an empty string if not found.
535+
Parses the YAML frontmatter so block scalar descriptions (``|``
536+
and ``>``) keep their YAML semantics instead of being treated as
537+
raw text.
538538
"""
539-
in_frontmatter = False
540-
for line in content.splitlines():
541-
stripped = line.rstrip("\n\r")
542-
if stripped == "---":
543-
if not in_frontmatter:
544-
in_frontmatter = True
545-
continue
546-
break # second ---
547-
if in_frontmatter and stripped.startswith("description:"):
548-
_, _, value = stripped.partition(":")
549-
return value.strip().strip('"').strip("'")
539+
import yaml
540+
541+
frontmatter_text, _ = TomlIntegration._split_frontmatter(content)
542+
if not frontmatter_text:
543+
return ""
544+
try:
545+
frontmatter = yaml.safe_load(frontmatter_text) or {}
546+
except yaml.YAMLError:
547+
return ""
548+
549+
if not isinstance(frontmatter, dict):
550+
return ""
551+
552+
description = frontmatter.get("description", "")
553+
if isinstance(description, str):
554+
return description
550555
return ""
551556

557+
@staticmethod
558+
def _split_frontmatter(content: str) -> tuple[str, str]:
559+
"""Split YAML frontmatter from the remaining content.
560+
561+
Returns ``("", content)`` when no complete frontmatter block is
562+
present. The body is preserved exactly as written so prompt text
563+
keeps its intended formatting.
564+
"""
565+
if not content.startswith("---"):
566+
return "", content
567+
568+
lines = content.splitlines(keepends=True)
569+
if not lines or lines[0].rstrip("\r\n") != "---":
570+
return "", content
571+
572+
frontmatter_end = -1
573+
for i, line in enumerate(lines[1:], start=1):
574+
if line.rstrip("\r\n") == "---":
575+
frontmatter_end = i
576+
break
577+
578+
if frontmatter_end == -1:
579+
return "", content
580+
581+
frontmatter = "".join(lines[1:frontmatter_end])
582+
body = "".join(lines[frontmatter_end + 1 :])
583+
return frontmatter, body
584+
585+
@staticmethod
586+
def _render_toml_string(value: str) -> str:
587+
"""Render *value* as a TOML string literal.
588+
589+
Uses a basic string for single-line values, multiline basic
590+
strings for values containing newlines, and falls back to a
591+
literal string or escaped basic string when delimiters appear in
592+
the content.
593+
"""
594+
if "\n" not in value and "\r" not in value:
595+
escaped = value.replace("\\", "\\\\").replace('"', '\\"')
596+
return f'"{escaped}"'
597+
598+
escaped = value.replace("\\", "\\\\")
599+
if '"""' not in escaped:
600+
return '"""\n' + escaped + '"""'
601+
if "'''" not in value:
602+
return "'''\n" + value + "'''"
603+
604+
return '"' + (
605+
value.replace("\\", "\\\\")
606+
.replace('"', '\\"')
607+
.replace("\n", "\\n")
608+
.replace("\r", "\\r")
609+
.replace("\t", "\\t")
610+
) + '"'
611+
552612
@staticmethod
553613
def _render_toml(description: str, body: str) -> str:
554614
"""Render a TOML command file from description and body.
@@ -558,39 +618,19 @@ def _render_toml(description: str, body: str) -> str:
558618
to multiline literal strings (``'''``) if the body contains
559619
``\"\"\"``, then to an escaped basic string as a last resort.
560620
561-
The body is rstrip'd so the closing delimiter appears on the line
562-
immediately after the last content line — matching the release
563-
script's ``echo "$body"; echo '\"\"\"'`` pattern.
621+
The body is ``rstrip("\\n")``'d before rendering, so the TOML
622+
value preserves content without forcing a trailing newline. As a
623+
result, multiline delimiters appear on their own line only when
624+
the rendered value itself ends with a newline.
564625
"""
565626
toml_lines: list[str] = []
566627

567628
if description:
568-
desc = description.replace('"', '\\"')
569-
toml_lines.append(f'description = "{desc}"')
629+
toml_lines.append(f"description = {TomlIntegration._render_toml_string(description)}")
570630
toml_lines.append("")
571631

572632
body = body.rstrip("\n")
573-
574-
# Escape backslashes for basic multiline strings.
575-
escaped = body.replace("\\", "\\\\")
576-
577-
if '"""' not in escaped:
578-
toml_lines.append('prompt = """')
579-
toml_lines.append(escaped)
580-
toml_lines.append('"""')
581-
elif "'''" not in body:
582-
toml_lines.append("prompt = '''")
583-
toml_lines.append(body)
584-
toml_lines.append("'''")
585-
else:
586-
escaped_body = (
587-
body.replace("\\", "\\\\")
588-
.replace('"', '\\"')
589-
.replace("\n", "\\n")
590-
.replace("\r", "\\r")
591-
.replace("\t", "\\t")
592-
)
593-
toml_lines.append(f'prompt = "{escaped_body}"')
633+
toml_lines.append(f"prompt = {TomlIntegration._render_toml_string(body)}")
594634

595635
return "\n".join(toml_lines) + "\n"
596636

@@ -630,7 +670,8 @@ def setup(
630670
raw = src_file.read_text(encoding="utf-8")
631671
description = self._extract_description(raw)
632672
processed = self.process_template(raw, self.key, script_type, arg_placeholder)
633-
toml_content = self._render_toml(description, processed)
673+
_, body = self._split_frontmatter(processed)
674+
toml_content = self._render_toml(description, body)
634675
dst_name = self.command_filename(src_file.stem)
635676
dst_file = self.write_file_and_record(
636677
toml_content, dest / dst_name, project_root, manifest

tests/integrations/test_integration_base_toml.py

Lines changed: 72 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,9 @@
99
"""
1010

1111
import os
12+
import tomllib
13+
14+
import pytest
1215

1316
from specify_cli.integrations import INTEGRATION_REGISTRY, get_integration
1417
from specify_cli.integrations.base import TomlIntegration
@@ -132,13 +135,77 @@ def test_toml_uses_correct_arg_placeholder(self, tmp_path):
132135
has_args = any("{{args}}" in f.read_text(encoding="utf-8") for f in cmd_files)
133136
assert has_args, "No TOML command file contains {{args}} placeholder"
134137

138+
@pytest.mark.parametrize(
139+
("frontmatter", "expected"),
140+
[
141+
(
142+
"---\ndescription: |\n First line\n Second line\n---\nBody\n",
143+
"First line\nSecond line\n",
144+
),
145+
(
146+
"---\ndescription: >\n First line\n Second line\n---\nBody\n",
147+
"First line Second line\n",
148+
),
149+
(
150+
"---\ndescription: |-\n First line\n Second line\n---\nBody\n",
151+
"First line\nSecond line",
152+
),
153+
(
154+
"---\ndescription: >-\n First line\n Second line\n---\nBody\n",
155+
"First line Second line",
156+
),
157+
],
158+
)
159+
def test_toml_extract_description_supports_block_scalars(self, frontmatter, expected):
160+
assert TomlIntegration._extract_description(frontmatter) == expected
161+
162+
def test_split_frontmatter_ignores_indented_delimiters(self):
163+
content = (
164+
"---\n"
165+
"description: |\n"
166+
" line one\n"
167+
" ---\n"
168+
" line two\n"
169+
"---\n"
170+
"Body\n"
171+
)
172+
173+
frontmatter, body = TomlIntegration._split_frontmatter(content)
174+
175+
assert "line two" in frontmatter
176+
assert body == "Body\n"
177+
178+
def test_toml_prompt_excludes_frontmatter(self, tmp_path, monkeypatch):
179+
i = get_integration(self.KEY)
180+
template = tmp_path / "sample.md"
181+
template.write_text(
182+
"---\n"
183+
"description: Summary line one\n"
184+
"scripts:\n"
185+
" sh: scripts/bash/example.sh\n"
186+
"---\n"
187+
"Body line one\n"
188+
"Body line two\n",
189+
encoding="utf-8",
190+
)
191+
monkeypatch.setattr(i, "list_command_templates", lambda: [template])
192+
193+
m = IntegrationManifest(self.KEY, tmp_path)
194+
created = i.setup(tmp_path, m)
195+
cmd_files = [f for f in created if "scripts" not in f.parts]
196+
assert len(cmd_files) == 1
197+
198+
generated = cmd_files[0].read_text(encoding="utf-8")
199+
parsed = tomllib.loads(generated)
200+
201+
assert parsed["description"] == "Summary line one"
202+
assert parsed["prompt"] == "Body line one\nBody line two"
203+
assert "description:" not in parsed["prompt"]
204+
assert "scripts:" not in parsed["prompt"]
205+
assert "---" not in parsed["prompt"]
206+
135207
def test_toml_is_valid(self, tmp_path):
136208
"""Every generated TOML file must parse without errors."""
137-
try:
138-
import tomllib
139-
except ModuleNotFoundError:
140-
import tomli as tomllib # type: ignore[no-redef]
141-
142209
i = get_integration(self.KEY)
143210
m = IntegrationManifest(self.KEY, tmp_path)
144211
created = i.setup(tmp_path, m)

0 commit comments

Comments
 (0)