diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..09b14c2b --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "img2mol/clean_img2smiles"] + path = img2mol/clean_img2smiles + url = https://github.com/carbonsilicon-ai/clean_img2smiles.git diff --git a/README_molecule_testing.md b/README_molecule_testing.md new file mode 100644 index 00000000..63585425 --- /dev/null +++ b/README_molecule_testing.md @@ -0,0 +1,204 @@ +# 分子识别Extraction测试脚本使用说明 + +这些测试脚本用于测试marker_main的extraction功能,特别是集成了img2mol的分子识别功能。**现在支持Mock模式,可以在没有img2mol依赖的情况下快速测试分子标签生成!** + +## 文件说明 + +1. **test_molecule_extraction.py** - 基础测试脚本,支持Mock模式 +2. **test_molecule_extraction_enhanced.py** - 增强版测试脚本,推荐使用,支持Mock模式 +3. **README_molecule_testing.md** - 本说明文档 + +## 🎭 Mock模式 vs 🔬 真实模式 + +### Mock模式(默认) +- ✅ **无需安装img2mol依赖** +- ✅ **快速测试**,几秒钟完成 +- ✅ **固定输出**:`c1ccccc1` 和 `placeholder` +- ✅ **随机生成**分子位置和数量 +- ✅ **智能转换**部分表格为分子表格 +- 🎯 适合测试分子标签生成逻辑 + +### 真实模式 +- 📚 需要安装完整的img2mol依赖 +- ⚡ 需要GPU支持(推荐) +- 🎯 真实的分子检测和识别 +- 🔍 实际的化学结构分析 + +## 快速开始 + +### 1. 环境准备 + +```bash +# 基础依赖(必需) +pip install -r requirements.txt + +# img2mol依赖(仅真实模式需要) +# pip install img2mol依赖... +``` + +### 2. 准备测试PDF + +找一个PDF文件(任何PDF都可以,Mock模式会自动生成分子内容): + +### 3. 运行测试 + +```bash +# 🎭 Mock模式测试(推荐,快速测试) +python test_molecule_extraction.py +python test_molecule_extraction_enhanced.py + +# 🔬 真实模型测试(需要img2mol) +python test_molecule_extraction.py --real +python test_molecule_extraction_enhanced.py --real + +# 📊 综合测试 +python test_molecule_extraction_enhanced.py --comprehensive +python test_molecule_extraction_enhanced.py --comprehensive --real +``` + +## 测试输出 + +### Mock模式输出示例 +``` +🎭 运行模式: Mock测试模式 (快速测试) + - 将生成假的分子检测数据 + - 输出固定内容: c1ccccc1placeholder + - 不需要img2mol依赖 + +📄 页面 1: Mock生成 3 个分子, 1 个分子表格 +📄 页面 2: Mock生成 2 个分子, 0 个分子表格 + +📊 提取结果分析: + - 分子结构数量: 5 + - 分子表格数量: 1 + - 文本中标签: 5 + - 文本中标签: 1 +``` + +### 生成的文件 +1. **Markdown文件** - 包含分子标签的转换结果 + ``` + {doc_id}_enhanced_molecules.md + ``` + +2. **元数据文件** - 详细的处理统计信息 + ``` + {doc_id}_enhanced_metadata.json + ``` + +3. **配置文件** - 使用的配置参数 + ``` + {doc_id}_config.json + ``` + +## 分子标签说明 + +### Mock模式输出 +- `c1ccccc1` - 苯环的SMILES表示 +- `placeholder` - 分子表格占位符 + +### 真实模式输出 +- `实际的分子结构` - 检测到的真实分子结构 +- `表格内容` - 检测到的分子表格内容 + +## 配置选项 + +### Mock模式配置 +```python +processor_config = { + 'use_mock_data': True, # 启用Mock模式 + 'mock_mode': True, # 别名 + 'debug': True, # 调试输出 + # 其他配置在Mock模式下会被忽略 +} +``` + +### 真实模式配置 +```python +processor_config = { + 'use_mock_data': False, # 禁用Mock模式 + 'device': 'cuda', # 设备:'cuda' 或 'cpu' + 'with_mol_detect': True, # 启用分子检测 + 'with_table_detect': True, # 启用表格检测 + 'use_yolo_mol_model': True, # 使用YOLO分子模型 + 'use_yolo_table_model': True, # 使用YOLO表格模型 + 'debug': True, # 调试模式 + 'num_workers': 1, # 工作线程数 + # 更多img2mol配置... +} +``` + +## 常见问题 + +### 1. Mock模式相关 + +**Q: Mock模式生成的分子数量是固定的吗?** +A: 不是,每页随机生成2-4个分子,30-50%的表格会被转换为分子表格。 + +**Q: 可以自定义Mock输出内容吗?** +A: 目前输出固定的`c1ccccc1`(苯环)和`placeholder`,可以修改代码中的mock数据。 + +**Q: Mock模式会替换真实的表格吗?** +A: 是的,Mock模式会智能地将部分检测到的表格转换为分子表格,演示替换逻辑。 + +### 2. 模式切换 + +**Q: 如何在代码中控制使用哪种模式?** +A: 通过命令行参数:默认Mock模式,使用`--real`启用真实模式。 + +**Q: 可以在同一次运行中测试两种模式吗?** +A: 需要分别运行,但可以使用综合测试脚本依次测试。 + +### 3. 性能对比 + +| 特性 | Mock模式 | 真实模式 | +|------|----------|----------| +| 速度 | 🚀 极快(秒级) | ⏳ 较慢(分钟级) | +| 依赖 | 📦 少 | 📚 多 | +| 准确性 | 🎭 模拟 | 🔬 真实 | +| 用途 | 🧪 功能测试 | 🎯 实际应用 | + +## 故障排除 + +### Mock模式问题 +1. **没有生成分子标签** - 检查PDF是否有内容,Mock模式需要页面结构 +2. **Mock模式失效** - 检查`use_mock_data`配置是否正确传递 + +### 真实模式问题 +1. **模型加载失败** - 检查img2mol依赖和模型文件 +2. 
**GPU内存不足** - 切换到CPU模式或减少工作线程 + +### 通用问题 +1. **PDF文件无法读取** - 确保路径正确且文件存在 +2. **输出目录权限** - 确保有写权限 + +## 开发建议 + +1. **🎭 先用Mock模式测试** - 验证基础功能和标签生成 +2. **🔬 再用真实模式验证** - 确认实际分子检测效果 +3. **📊 使用综合测试** - 批量测试不同配置 +4. **🐛 调试时启用debug** - 获取详细日志 + +## 示例脚本 + +```bash +# 快速验证功能 +python test_molecule_extraction_enhanced.py + +# 完整测试流程 +python test_molecule_extraction_enhanced.py --comprehensive + +# 真实模型测试 +python test_molecule_extraction_enhanced.py --real + +# 查看帮助 +python test_molecule_extraction_enhanced.py --help +``` + +## 联系支持 + +如果测试脚本无法正常工作,请提供: +- 错误信息完整日志 +- 使用的PDF文件类型和大小 +- 系统配置(GPU/CPU,内存等) +- Python环境信息 \ No newline at end of file diff --git a/app.sh b/app.sh new file mode 100755 index 00000000..0e731bda --- /dev/null +++ b/app.sh @@ -0,0 +1,4 @@ +#!/bin/bash +source /opt/conda/etc/profile.d/conda.sh # 假设 Conda 安装在 /opt/conda +conda activate myenv +python3 run_app.py \ No newline at end of file diff --git a/data/molecule.pdf b/data/molecule.pdf new file mode 100644 index 00000000..238317a1 Binary files /dev/null and b/data/molecule.pdf differ diff --git a/dockerfile b/dockerfile new file mode 100644 index 00000000..a133bd3b --- /dev/null +++ b/dockerfile @@ -0,0 +1,8 @@ +FROM drugflow_marker:base + +RUN pip install marker-pdf==1.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple +# 删除marker-pdf自动安装的torch等依赖,释放空间 +RUN pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu118 + +RUN pip install python-docx python-pptx -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple + diff --git a/dockerfile2 b/dockerfile2 new file mode 100644 index 00000000..af840cae --- /dev/null +++ b/dockerfile2 @@ -0,0 +1,14 @@ +FROM marker_with_mol:v5 + +RUN apt-get install -y \ + fonts-noto-cjk \ + fonts-wqy-zenhei \ + fonts-wqy-microhei \ + fonts-arphic-ukai \ + fonts-arphic-uming \ + fontconfig + +RUN conda run -n myenv conda install -y -c conda-forge cairo pango gdk-pixbuf libffi +RUN conda run -n myenv pip install boto3 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple +RUN conda run -n myenv pip install weasyprint mammoth -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple + diff --git a/dockerfile_0616_maker_with_mol_v5 b/dockerfile_0616_maker_with_mol_v5 new file mode 100644 index 00000000..6915efed --- /dev/null +++ b/dockerfile_0616_maker_with_mol_v5 @@ -0,0 +1,18 @@ +FROM marker_with_mol:v4 + +RUN set -xe +# 设置非交互式环境变量(避免 apt 和 pip 的交互提示) +ENV DEBIAN_FRONTEND=noninteractive \ + PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=off \ + PIP_DISABLE_PIP_VERSION_CHECK=on + +# 安装系统依赖(poppler-utils) +RUN apt-get update && \ + apt-get install -y --no-install-recommends poppler-utils && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN apt-get install poppler-utils +RUN /opt/conda/envs/myenv/bin/pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu118 +RUN /opt/conda/envs/myenv/bin/pip install onnx==1.16.1 onnxruntime==1.16.3 onnxruntime-gpu==1.16.3 -i https://pypi.tuna.tsinghua.edu.cn/simple/ \ No newline at end of file diff --git a/img2mol/clean_img2smiles b/img2mol/clean_img2smiles new file mode 160000 index 00000000..a2f3a419 --- /dev/null +++ b/img2mol/clean_img2smiles @@ -0,0 +1 @@ +Subproject commit a2f3a419e3dbd70ff8c1ba25764675c954015033 diff --git a/marker/builders/document.py b/marker/builders/document.py index e87ba001..97f07411 100644 --- a/marker/builders/document.py +++ b/marker/builders/document.py @@ -9,6 +9,11 @@ from marker.schema.document import Document 
from marker.schema.groups.page import PageGroup from marker.schema.registry import get_block_class +from marker.utils import send_callback, flush_cuda_memory +from datetime import datetime +import pytz +# 获取北京时区 +beijing_tz = pytz.timezone('Asia/Shanghai') class DocumentBuilder(BaseBuilder): @@ -23,28 +28,61 @@ class DocumentBuilder(BaseBuilder): int, "DPI setting for high-resolution page images used for OCR.", ] = 192 + original_image_dpi: Annotated[ + int, + "DPI setting for original-resolution page images for high-quality processing.", + ] = 300 disable_ocr: Annotated[ bool, "Disable OCR processing.", ] = False - def __call__(self, provider: PdfProvider, layout_builder: LayoutBuilder, line_builder: LineBuilder, ocr_builder: OcrBuilder): + def __call__(self, provider: PdfProvider, layout_builder: LayoutBuilder, line_builder: LineBuilder, ocr_builder: OcrBuilder, callback_url: str | None = None, docId: str | None = None, second_layout_builder = None): document = self.build_document(provider) + flush_cuda_memory() + time_str = datetime.now(beijing_tz).strftime("%H:%M:%S") + send_callback(callback_url, { + 'status': True, + 'messages': 'success', + 'docId': docId, + 'progress': 21, + 'progress_text': '完成文字Detection ' + time_str + }) + layout_builder(document, provider) + # 如果有第二个layout_builder(分子识别),则调用第二个layout_builder + if second_layout_builder: + second_layout_builder(document, provider) + flush_cuda_memory() + line_builder(document, provider) + + flush_cuda_memory() + time_str = datetime.now(beijing_tz).strftime("%H:%M:%S") + send_callback(callback_url, { + 'status': True, + 'messages': 'success', + 'docId': docId, + 'progress': 42, + 'progress_text': '完成Layout解析 ' + time_str + }) + if not self.disable_ocr: ocr_builder(document, provider) + return document def build_document(self, provider: PdfProvider): PageGroupClass: PageGroup = get_block_class(BlockTypes.Page) lowres_images = provider.get_images(provider.page_range, self.lowres_image_dpi) highres_images = provider.get_images(provider.page_range, self.highres_image_dpi) + original_images = provider.get_images(provider.page_range, self.original_image_dpi) initial_pages = [ PageGroupClass( page_id=p, lowres_image=lowres_images[i], highres_image=highres_images[i], + original_image=original_images[i], polygon=provider.get_page_bbox(p), refs=provider.get_page_refs(p) ) for i, p in enumerate(provider.page_range) diff --git a/marker/builders/molecule_layout.py b/marker/builders/molecule_layout.py new file mode 100644 index 00000000..c221a983 --- /dev/null +++ b/marker/builders/molecule_layout.py @@ -0,0 +1,751 @@ +from typing import Annotated, List, Dict, Any, Optional +import random + +from marker.builders import BaseBuilder +from marker.providers.pdf import PdfProvider +from marker.schema.document import Document +from marker.schema.groups import PageGroup +from marker.schema.blocks import Molecule, MoleculeTable +from marker.schema.polygon import PolygonBox +from marker.schema import BlockTypes + +import copy +import sys +import traceback +from PIL import Image +from tqdm import tqdm +import os +import warnings +import io + +# Try to import img2mol processor +IMG2MOL_AVAILABLE = True +# Suppress warnings +warnings.filterwarnings("ignore") + + +class MoleculeLayoutBuilder(BaseBuilder): + """ + A builder for performing chemical molecule layout detection on PDF pages and merging the results into the document. + Uses img2mol's Parser_Processer for molecule and table detection, or mock data for testing. 
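+
+    Example (illustrative sketch; assumes the clean_img2smiles submodule is importable
+    and that a Document / PdfProvider pair has already been built):
+
+        builder = MoleculeLayoutBuilder(processor_config={"device": "cuda",
+                                                          "with_mol_detect": True,
+                                                          "with_table_detect": True})
+        builder(document, provider)   # merges Molecule / MoleculeTable blocks into each page
+
+    Passing processor_config={"use_mock_data": True} exercises the same merge logic
+    without loading any img2mol models.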
+ 采用单例模式来避免重复创建Parser_Processer实例,减少内存泄漏。 + """ + # 单例相关的类变量 + _instance = None + _processor_cache = {} # 用于缓存不同配置的processor实例 + + # The overlap threshold for replacing existing blocks with molecule blocks + overlap_threshold: float = 0.9 + + # The overlap threshold for replacing table blocks with molecule table blocks + table_overlap_threshold: float = 0.9 + + # Whether to disable the tqdm progress bar + disable_tqdm: bool = False + + # Whether to use mock data instead of real img2mol detection + use_mock_data: bool = False + + def __new__(cls, processor_config=None, config=None): + """实现单例模式,确保相同配置只有一个实例""" + # 创建配置的哈希键 + config_key = cls._create_config_key(processor_config, config) + + if config_key not in cls._processor_cache: + instance = super(MoleculeLayoutBuilder, cls).__new__(cls) + cls._processor_cache[config_key] = instance + print(f"🆕 Created new MoleculeLayoutBuilder instance for config: {config_key}") + else: + print(f"♻️ Reusing existing MoleculeLayoutBuilder instance for config: {config_key}") + + return cls._processor_cache[config_key] + + @classmethod + def _create_config_key(cls, processor_config, config): + """创建配置的唯一键""" + # 提取关键配置参数创建哈希 + import hashlib + import json + + key_params = {} + + if processor_config: + # 只选择影响模型加载的关键参数 + key_params.update({ + 'device': processor_config.get('device', 'cuda'), + 'with_mol_detect': processor_config.get('with_mol_detect', True), + 'with_table_detect': processor_config.get('with_table_detect', True), + 'use_yolo_mol_model': processor_config.get('use_yolo_mol_model', True), + 'use_yolo_table_model': processor_config.get('use_yolo_table_model', True), + 'use_got_ocr_model': processor_config.get('use_got_ocr_model', True), + 'model_dir': processor_config.get('model_dir', 'default') + }) + + if config: + key_params.update({ + 'use_molecule_detection': config.get('use_molecule_detection', False) + }) + + # 创建哈希 + config_str = json.dumps(key_params, sort_keys=True) + return hashlib.md5(config_str.encode()).hexdigest()[:8] + + @classmethod + def clear_cache(cls): + """清理所有缓存的实例""" + print("🧹 Clearing MoleculeLayoutBuilder cache...") + for config_key, instance in cls._processor_cache.items(): + if hasattr(instance, 'cleanup_memory'): + instance.cleanup_memory() + cls._processor_cache.clear() + print("✅ MoleculeLayoutBuilder cache cleared") + + def __init__(self, processor_config=None, config=None): + """ + 初始化分子识别Layout Builder + + Args: + processor_config: img2mol Parser_Processer的配置参数 + config: marker配置 + """ + # 防止重复初始化 + if hasattr(self, '_initialized'): + print("♻️ MoleculeLayoutBuilder already initialized, skipping...") + return + + super().__init__(config) + + self.processor_config = processor_config or {} + self.processor = None + self._initialized = True + + # 检查是否使用mock模式 + self.use_mock_data = ( + self.processor_config.get('use_mock_data', False) or + not IMG2MOL_AVAILABLE or + self.processor_config.get('mock_mode', False) + ) + + if not self.use_mock_data: + self._initialize_processor() + else: + print("🎭 使用Mock模式进行分子检测测试") + + def _initialize_processor(self): + """Initialize the img2mol Parser_Processer""" + try: + # Import img2mol processor + import sys + sys.path.append('/app/img2mol') + sys.path.append('/app/img2mol/clean_img2smiles/src') + from img2smiles.pipeline.Processer import Parser_Processer + + # Create processor with configuration + self.processor = Parser_Processer(**self.processor_config) + + print("✅ Successfully initialized img2mol Parser_Processer") + + except Exception as e: + traceback.print_exc() + 
print(f"Warning: Failed to initialize img2mol processor: {e}") + print("🎭 切换到Mock模式") + self.use_mock_data = True + self.processor = None + + def cleanup_memory(self): + """清理内存""" + try: + if self.processor and hasattr(self.processor, 'cleanup_memory'): + self.processor.cleanup_memory() + + self.processor = None + + # 从全局模型管理器导入并清理 + try: + import sys + sys.path.append('/app/img2mol') + sys.path.append('/app/img2mol/clean_img2smiles/src') + from img2smiles.pipeline.model_manager import model_manager + model_manager.print_memory_stats() + except: + pass + + print("✅ MoleculeLayoutBuilder memory cleaned up") + + except Exception as e: + print(f"⚠️ Warning during MoleculeLayoutBuilder cleanup: {e}") + + def __del__(self): + """析构函数""" + try: + self.cleanup_memory() + except: + pass + + def __call__(self, document: Document, provider: PdfProvider): + """Process all pages in the document to detect molecules and tables""" + if self.use_mock_data: + detection_results = self.generate_mock_detection_results(document.pages) + else: + if self.processor is None: + print("Molecule processor not available, skipping molecule detection") + return + detection_results = self.detect_molecules_and_tables(document.pages) + + self.merge_molecule_blocks_to_pages(document.pages, detection_results) + + def generate_mock_detection_results(self, pages: List[PageGroup]) -> List[dict]: + """ + 生成Mock检测结果用于测试 + 随机生成一些分子结构,基于已有表格生成分子表格(坐标微调用于测试覆盖功能) + """ + results = [] + + for page_idx, page in enumerate(tqdm(pages, disable=self.disable_tqdm, desc="Mock分子检测")): + molecules = [] + tables = [] + + # 获取页面尺寸(用于生成合理的bbox) + page_width = 800 # 默认值 + page_height = 1000 # 默认值 + + if hasattr(page, 'page_image') and page.page_image: + if hasattr(page.page_image, 'size'): + page_width, page_height = page.page_image.size + elif hasattr(page.page_image, 'shape'): + page_height, page_width = page.page_image.shape[:2] + + # 随机生成2-4个分子结构 + num_molecules = random.randint(2, 4) + for _ in range(num_molecules): + # 生成随机bbox + x1 = random.randint(50, page_width - 200) + y1 = random.randint(50, page_height - 200) + x2 = x1 + random.randint(80, 150) + y2 = y1 + random.randint(80, 150) + + molecules.append({ + 'bbox': [x1, y1, x2, y2], + 'confidence': random.uniform(0.8, 0.95), + 'data': { + 'bbox': [x1, y1, x2, y2], + 'smiles': 'c1ccccc1', # 苯环的SMILES + 'mock': True + } + }) + + # 只基于已有的Table blocks生成分子表格mock数据(坐标微调,内容替换) + existing_tables = [] + if hasattr(page, 'children'): + existing_tables = [b for b in page.children if hasattr(b, 'block_type') and b.block_type == BlockTypes.Table] + elif hasattr(page, 'blocks'): + existing_tables = [b for b in page.blocks if b.block_type == BlockTypes.Table] + + if existing_tables: + print(f"📋 页面 {page_idx + 1}: 发现 {len(existing_tables)} 个已有表格,将生成对应的分子表格mock数据") + + for table_block in existing_tables: + # 获取原始表格的坐标 + original_bbox = table_block.polygon.bbox + x1, y1, x2, y2 = original_bbox + + # 微调坐标(稍微偏移,确保有足够重叠来触发替换) + # 偏移范围:-5到+5像素,确保90%以上重叠 + offset_x = random.randint(-5, 5) + offset_y = random.randint(-5, 5) + + adjusted_bbox = [ + x1 + offset_x, + y1 + offset_y, + x2 + offset_x, + y2 + offset_y + ] + + # 生成HTML格式的分子表格内容 + mock_html_table = self._generate_mock_molecule_table_html() + + tables.append({ + 'bbox': adjusted_bbox, + 'confidence': random.uniform(0.85, 0.95), + 'data': { + 'bbox': adjusted_bbox, + 'original_bbox': original_bbox, # 保存原始坐标用于调试 + 'table_type': 'molecule_table', + 'html_content': mock_html_table, + 'format': 'html', + 'mock': True, + 'source': 'existing_table_adjusted' # 
标记数据来源 + } + }) + else: + print(f"📋 页面 {page_idx + 1}: 未发现已有表格,跳过分子表格生成") + + results.append({ + 'page_idx': page_idx, + 'molecules': molecules, + 'tables': tables + }) + + print(f"📄 页面 {page_idx + 1}: Mock生成 {len(molecules)} 个分子, {len(tables)} 个分子表格 (基于已有表格)") + + return results + + def _generate_mock_molecule_table_html(self): + """ + 生成Mock分子表格的HTML内容 + 包含化学分子结构数据,cell里填入C1CCCCC1等SMILES + """ + # 定义一些常见的分子SMILES + molecules = [ + 'C1CCCCC1', # 环己烷 + 'c1ccccc1', # 苯 + 'CCO', # 乙醇 + 'CC(=O)O', # 乙酸 + 'CC(C)C', # 异丙烷 + 'C1=CC=CC=C1O', # 苯酚 + 'CCN', # 乙胺 + 'C1CCC(CC1)O' # 环己醇 + ] + + # 随机选择表格大小(2-4行,2-3列) + rows = random.randint(2, 4) + cols = random.randint(2, 3) + + # 构建HTML表格 + html_parts = [''] + + # 表头 + html_parts.append('') + headers = ['化合物', 'SMILES', '分子量'] if cols == 3 else ['化合物', 'SMILES'] + for header in headers[:cols]: + html_parts.append(f'') + html_parts.append('') + + # 数据行 + for i in range(rows): + html_parts.append('') + mol_smiles = random.choice(molecules) + + for j in range(cols): + if j == 0: # 化合物名称列 + content = f'化合物-{i+1}' + elif j == 1: # SMILES列 + content = mol_smiles + else: # 分子量列 + content = f'{random.randint(50, 300)}.{random.randint(10, 99)}' + + html_parts.append(f'') + + html_parts.append('') + + html_parts.append('
{header}
{content}
') + + return ''.join(html_parts) + + def detect_molecules_and_tables(self, pages: List[PageGroup]) -> List[dict]: + """ + Detect molecules and tables on each page using img2mol _prediction_from_pdf + + Returns: + List of detection results for each page + """ + results = [] + + for page_idx, page in enumerate(tqdm(pages, disable=self.disable_tqdm, desc="Detecting molecules")): + try: + # Get page image + page_image = page.get_image(highres=True) + if page_image is None: + print(f"Warning: No image available for page {page_idx}") + results.append({'page_idx': page_idx, 'molecules': [], 'tables': []}) + continue + + # Convert to PIL Image if needed + if not isinstance(page_image, Image.Image): + if hasattr(page_image, 'image'): + page_image = page_image.image + else: + page_image = Image.fromarray(page_image) + + # Use _prediction_from_pdf method for comprehensive detection + # This method handles both molecule and table detection in one call + + # Set up parameters for _prediction_from_pdf + with_mol_detect = self.processor_config.get('with_mol_detect', True) + with_table_detect = self.processor_config.get('with_table_detect', True) + + if with_mol_detect or with_table_detect: + # Call _prediction_from_pdf with image input + # Returns: total_result_dict, total_table_result_dict (if with_table=True) + # or just total_result_dict (if with_table=False) + prediction_result = self.processor._prediction_from_pdf( + image=page_image, + page_idx_list=[page_idx + 1], # _prediction_from_pdf expects 1-based page indices + with_tta=True, + with_layout_parser=True, + use_coref=True, + use_ocr=True, + debug=False, + with_molscribe=True, + with_table=True, + with_ocr=True, + with_html=False, + with_expand_mol=False, + return_realative_coordinates=True, + quick_prediction=False, + mode='auto', + osd_detect=False, + return_table_html=False + ) + + # Parse the result based on return type + if with_table_detect: + # Returns (total_result_dict, total_table_result_dict) + if isinstance(prediction_result, tuple) and len(prediction_result) >= 2: + total_result_dict, total_table_result_dict = prediction_result[:2] + else: + total_result_dict = prediction_result + total_table_result_dict = {} + else: + # Returns just total_result_dict + total_result_dict = prediction_result + total_table_result_dict = {} + + # Process molecule results + molecules = [] + page_key = page_idx + 1 # _prediction_from_pdf uses 1-based page indices + if page_key in total_result_dict: + for mol_result in total_result_dict[page_key]: + if 'mol_box' in mol_result: + mol_box = mol_result['mol_box'] + # print('mol_result', mol_result, flush=True) + # Convert tuple to list format + bbox = [mol_box[0], mol_box[1], mol_box[2], mol_box[3]] + print('bbbbbbbbbbbox', bbox, flush=True) + + # Convert relative coordinates to absolute coordinates + page_width = page.polygon.width + page_height = page.polygon.height + absolute_bbox = [ + bbox[0] * page_width, # x1 * width + bbox[1] * page_height, # y1 * height + bbox[2] * page_width, # x2 * width + bbox[3] * page_height # y2 * height + ] + print(f'absolute_bbox: {absolute_bbox}, page_size: {page_width}x{page_height}', flush=True) + + # Extract additional data from _prediction_from_pdf result + smiles = mol_result.get('post_SMILES', mol_result.get('Cano_SMILES', 'detected_molecule')) + + molecules.append({ + 'bbox': absolute_bbox, + 'confidence': 0.9, # Default confidence + 'data': { + 'page_idx': page_idx, + 'bbox': bbox, + 'label_box': mol_result.get('label_box_list', []), + 'label': 
'/'.join(mol_result.get('label_string', [])), + 'smiles': smiles, + 'mol_block': mol_result.get('post_molblock', ''), + 'assigned_idx': mol_result.get('assigned_idx', ''), + 'state': mol_result.get('state', 'unknown'), + 'mock': False + } + }) + + # Process table results + tables = [] + if with_table_detect and page_key in total_table_result_dict: + for table_result in total_table_result_dict[page_key]: + if table_result: + # Extract HTML content if available + html_content = table_result.get('html', '
<table><tr><td>Molecular Data Table</td></tr></table>
') + # 如果没有smiles + if "Cano_SMILES" not in html_content: + continue + print('table_result', table_result, table_result['box'], flush=True) + ori_bbox = table_result['box'] + bbox = [ori_bbox[0], ori_bbox[1], ori_bbox[2], ori_bbox[3]] + print('ccccccccccbox', bbox, flush=True) + + # Compare with page dimensions to see if this is already absolute + page_width = page.polygon.width + page_height = page.polygon.height + absolute_bbox = [ + bbox[0] * page_width, # x1 * width + bbox[1] * page_height, # y1 * height + bbox[2] * page_width, # x2 * width + bbox[3] * page_height # y2 * height + ] + print(f'table_bbox: {bbox}, page_size: {page_width}x{page_height}, bbox_range: x=[{bbox[0]}-{bbox[2]}], y=[{bbox[1]}-{bbox[3]}]', flush=True) + + tables.append({ + 'bbox': absolute_bbox, + 'confidence': table_result.get('confidence', 0.9), + 'data': { + 'bbox': bbox, + 'page_idx': page_idx, + 'table_type': 'molecule_table', + 'html_content': html_content, + 'dataframe': table_result.get('dataframe', None), + 'has_Rgroup': table_result.get('has_Rgroup', False), + 'format': 'html', + 'mock': False, + 'source': 'prediction_from_pdf' + } + }) + + else: + molecules = [] + tables = [] + + results.append({ + 'page_idx': page_idx, + 'molecules': molecules, + 'tables': tables + }) + + print(f"📄 页面 {page_idx + 1}: 检测到 {len(molecules)} 个分子, {len(tables)} 个分子表格") + + except Exception as e: + traceback.print_exc() + print(f"Error detecting molecules/tables on page {page_idx}: {e}") + results.append({'page_idx': page_idx, 'molecules': [], 'tables': []}) + + return results + + def _bbox_to_polygon(self, bbox): + """ + 将bbox转换为polygon格式 + bbox格式: [x1, y1, x2, y2] + polygon格式: [[x1,y1], [x2,y1], [x2,y2], [x1,y2]] + """ + + return PolygonBox.from_bbox(bbox) + + def merge_molecule_blocks_to_pages(self, pages: List[PageGroup], detection_results: List[dict]): + """ + Merge detected molecules and tables into page structures + + Args: + pages: List of page groups to modify + detection_results: Detection results from img2mol or mock data + """ + for page_result in detection_results: + page_idx = page_result.get('page_idx', 0) + if page_idx >= len(pages): + continue + + page = pages[page_idx] + new_blocks = [] + + # Process molecule detections + for molecule_detection in page_result.get('molecules', []): + bbox = molecule_detection.get('bbox', []) + if len(bbox) != 4: + continue + + polygon = self._bbox_to_polygon(bbox) + + if self.use_mock_data: + # Mock数据 + structure_data = { + 'smiles': 'c1ccccc1', + 'formula': 'C6H6', + 'mock': True + } + else: + # 真实数据 + structure_data = molecule_detection.get('data', {}) + + # Create molecule block with proper page_id + mol_block = Molecule( + polygon=polygon, + page_id=page.page_id, + structure_data=structure_data, + confidence=molecule_detection.get('confidence', 1.0) + ) + new_blocks.append(mol_block) + + # Process table detections + for table_detection in page_result.get('tables', []): + bbox = table_detection.get('bbox', []) + if len(bbox) != 4: + continue + + polygon = self._bbox_to_polygon(bbox) + table_data = table_detection.get('data', {}) + + # 获取HTML内容 + html_content = table_data.get('html_content', '') + + # 调试信息 + source = table_data.get('source', 'unknown') + original_bbox = table_data.get('original_bbox') + if original_bbox: + print(f"🔄 基于已有表格生成分子表格: 原始坐标 {original_bbox} -> 调整后坐标 {bbox}") + + # Create molecule table block with proper page_id + mol_table_block = MoleculeTable( + polygon=polygon, + page_id=page.page_id, + structure_data={'page_idx': page_idx, 'bbox': bbox, 
'html_content': html_content}, + html=html_content, # 直接使用html字段 + confidence=table_detection.get('confidence', 1.0) + ) + new_blocks.append(mol_table_block) + + if new_blocks: + # Replace overlapping blocks for molecules (any block type with high overlap) + molecule_blocks = [b for b in new_blocks if isinstance(b, Molecule)] + if molecule_blocks: + self._replace_overlapping_blocks( + page, + molecule_blocks, + self.overlap_threshold, + target_types=[BlockTypes.Figure, BlockTypes.Picture] + ) + + # Replace overlapping blocks for tables (specifically target Table blocks) + table_blocks = [b for b in new_blocks if isinstance(b, MoleculeTable)] + if table_blocks: + self._replace_overlapping_blocks( + page, + table_blocks, + self.table_overlap_threshold, + target_types=[BlockTypes.Table] + ) + + def _replace_overlapping_blocks(self, page: PageGroup, new_blocks: List, + threshold: float, exclude_types: List = None, + target_types: List = None): + """ + Replace overlapping blocks with new molecule/table blocks + + New logic: If any new_block overlaps with an existing_block above threshold, + the existing_block will be removed. All new_blocks will be added. + This handles cases where multiple molecules are within one figure. + + Args: + page: The page containing blocks to check + new_blocks: List of new blocks to add + threshold: Overlap threshold (0-1) + exclude_types: Block types to exclude from replacement + target_types: Only replace blocks of these types (if specified) + """ + if not new_blocks: + return + + if exclude_types is None: + exclude_types = [] + + blocks_to_remove = [] # existing blocks to remove + blocks_to_add = new_blocks # all new blocks will be added + + # First, identify all existing blocks that should be removed + for existing_block in page.current_children: # Use current_children to get non-removed blocks + # Skip if block type is excluded + if existing_block.block_type in exclude_types: + continue + + # If target_types specified, only consider those types + if target_types and existing_block.block_type not in target_types: + continue + + # Check if this existing block overlaps with any new block above threshold + should_remove = False + for new_block in new_blocks: + # Calculate overlap percentage (intersection area / new_block area) + overlap_pct = new_block.polygon.intersection_pct(existing_block.polygon) + print(f'overlap_pct: {overlap_pct:.3f} (intersection/new_block), existing_block: {existing_block.block_type}', flush=True) + + if overlap_pct >= threshold: + should_remove = True + print(f'🗑️ Will remove existing {existing_block.block_type} due to overlap {overlap_pct:.3f} with new molecule', flush=True) + break # No need to check other new blocks for this existing block + + if should_remove and existing_block not in blocks_to_remove: + blocks_to_remove.append(existing_block) + + print(f'📊 Summary: Removing {len(blocks_to_remove)} existing blocks, Adding {len(blocks_to_add)} new blocks', flush=True) + + # Execute the operations + self._execute_block_operations_v2(page, blocks_to_remove, blocks_to_add) + + def _execute_block_operations(self, page: PageGroup, blocks_to_replace: List, blocks_to_add: List): + """ + Execute block replacement and addition operations using proper page methods + + Args: + page: The page to modify + blocks_to_replace: List of (old_block, new_block) tuples + blocks_to_add: List of new blocks to add + """ + # Replace existing blocks + for old_block, new_block in blocks_to_replace: + # Set proper page_id for the new block + new_block.page_id = 
page.page_id + page.replace_block(old_block, new_block) + + # Add new blocks + for block_to_add in blocks_to_add: + # Set proper page_id for the new block + block_to_add.page_id = page.page_id + page.add_full_block(block_to_add) + # Also add to page structure for proper ordering + page.structure.append(block_to_add.id) + + def _execute_block_operations_v2(self, page: PageGroup, blocks_to_remove: List, blocks_to_add: List): + """ + Execute block removal and addition operations using proper page methods + Maintains correct rendering order by inserting new blocks at removed blocks' positions + + Args: + page: The page to modify + blocks_to_remove: List of existing blocks to remove + blocks_to_add: List of new blocks to add + """ + if not blocks_to_remove and not blocks_to_add: + return + + # Step 1: Record positions of blocks to be removed + removal_positions = {} # block_id -> position in structure + if page.structure: + for i, block_id in enumerate(page.structure): + for block_to_remove in blocks_to_remove: + if block_id == block_to_remove.id: + removal_positions[block_to_remove.id] = i + break + + # Step 2: Remove existing blocks by marking them as removed + for block_to_remove in blocks_to_remove: + print(f'🔥 Removing existing block: {block_to_remove.block_type} at {block_to_remove.polygon.bbox}', flush=True) + block_to_remove.removed = True + + # Step 3: Add new blocks and update structure with correct positioning + for block_to_add in blocks_to_add: + print(f'✅ Adding new block: {block_to_add.block_type} at {block_to_add.polygon.bbox}', flush=True) + # Set proper page_id for the new block + block_to_add.page_id = page.page_id + page.add_full_block(block_to_add) + + # Step 4: Rebuild page structure with correct ordering + if page.structure and removal_positions: + # Find the earliest position where a block was removed + earliest_position = min(removal_positions.values()) + print(f'📍 Inserting new blocks at position {earliest_position} (where removed blocks were)', flush=True) + + # Remove all removed block IDs from structure + original_structure = page.structure[:] + page.structure = [block_id for block_id in page.structure + if not any(block_id == removed_block.id for removed_block in blocks_to_remove)] + + # Insert new block IDs at the earliest removal position + new_block_ids = [block.id for block in blocks_to_add] + page.structure[earliest_position:earliest_position] = new_block_ids + + print(f'🔄 Structure updated: {len(original_structure)} -> {len(page.structure)} blocks', flush=True) + elif page.structure: + # Fallback: append to end if no removal positions found + for block_to_add in blocks_to_add: + page.structure.append(block_to_add.id) diff --git a/marker/config/parser.py b/marker/config/parser.py index 676de7f2..a26d43b9 100644 --- a/marker/config/parser.py +++ b/marker/config/parser.py @@ -51,6 +51,7 @@ def generate_config_dict(self) -> Dict[str, any]: config = {} output_dir = self.cli_options.get("output_dir", settings.OUTPUT_DIR) for k, v in self.cli_options.items(): + print('k', k, 'v', v) if not v: continue @@ -65,8 +66,9 @@ def generate_config_dict(self) -> Dict[str, any]: case "languages": config["languages"] = v.split(",") case "config_json": - with open(v, "r", encoding="utf-8") as f: - config.update(json.load(f)) + # with open(v, "r", encoding="utf-8") as f: + # config.update(json.load(f)) + config.update(v) case "disable_multiprocessing": config["pdftext_workers"] = 1 case "disable_image_extraction": diff --git a/marker/converters/pdf.py b/marker/converters/pdf.py 
index ccd03f03..aec43582 100644 --- a/marker/converters/pdf.py +++ b/marker/converters/pdf.py @@ -10,6 +10,7 @@ from marker.builders.document import DocumentBuilder from marker.builders.layout import LayoutBuilder from marker.builders.llm_layout import LLMLayoutBuilder +from marker.builders.molecule_layout import MoleculeLayoutBuilder from marker.builders.line import LineBuilder from marker.builders.ocr import OcrBuilder from marker.builders.structure import StructureBuilder @@ -45,22 +46,28 @@ from marker.processors.line_merge import LineMergeProcessor from marker.processors.llm.llm_mathblock import LLMMathBlockProcessor +from datetime import datetime +from marker.utils import send_callback, flush_cuda_memory +from marker import settings +import pytz +import traceback +# 获取北京时区 +beijing_tz = pytz.timezone('Asia/Shanghai') + class PdfConverter(BaseConverter): """ A converter for processing and rendering PDF files into Markdown, JSON, HTML and other formats. """ - override_map: Annotated[ - Dict[BlockTypes, Type[Block]], - "A mapping to override the default block classes for specific block types.", - "The keys are `BlockTypes` enum values, representing the types of blocks,", - "and the values are corresponding `Block` class implementations to use", - "instead of the defaults." - ] = defaultdict() - use_llm: Annotated[ - bool, - "Enable higher quality processing with LLMs.", - ] = False + # A mapping to override the default block classes for specific block types + override_map: Dict[BlockTypes, Type[Block]] = defaultdict() + + # Enable higher quality processing with LLMs + use_llm: bool = False + + # Enable chemical molecule detection using specialized layout model + use_molecule_detection: bool = False + default_processors: Tuple[BaseProcessor, ...] = ( OrderProcessor, LineMergeProcessor, @@ -95,7 +102,10 @@ def __init__( processor_list: Optional[List[str]] = None, renderer: str | None = None, llm_service: str | None = None, - config=None + config=None, + callback_url: str | None = None, + docId: str | None = None, + mol_detect: bool = False ): super().__init__(config) @@ -134,23 +144,94 @@ def __init__( self.layout_builder_class = LayoutBuilder if self.use_llm: self.layout_builder_class = LLMLayoutBuilder - - def build_document(self, filepath: str): - provider_cls = provider_from_filepath(filepath) + + # 分子layout builder + self.molecule_layout_builder = None + self.use_molecule_detection = mol_detect + if config.get("use_molecule_detection", False) or self.use_molecule_detection: + # img2mol processor的配置 + processor_config = artifact_dict.get("processor_config", {}) + + # 设置默认配置 + if not processor_config.get("device"): + processor_config["device"] = "cuda" + + try: + self.molecule_layout_builder = self.resolve_dependencies( + MoleculeLayoutBuilder + ) + except Exception as e: + traceback.print_exc() + print(f"Warning: Failed to initialize MoleculeLayoutBuilder: {e}") + self.molecule_layout_builder = None + + self.callback_url = callback_url + self.docId = docId + + def build_document(self, filepath: str, file_type: str='pdf'): + provider_cls = provider_from_filepath(filepath, file_type=file_type) layout_builder = self.resolve_dependencies(self.layout_builder_class) line_builder = self.resolve_dependencies(LineBuilder) ocr_builder = self.resolve_dependencies(OcrBuilder) - provider = provider_cls(filepath, self.config) - document = DocumentBuilder(self.config)(provider, layout_builder, line_builder, ocr_builder) + + # 如果启用了分子检测,则设置flatten_pdf为False + provider_config = self.config.copy() if 
self.config else {} + if self.use_molecule_detection: + provider_config['flatten_pdf'] = False + print(f"[MoleculeDetection] Setting flatten_pdf=False due to molecule detection enabled") + + provider = provider_cls(filepath, provider_config) + document = DocumentBuilder(self.config)( + provider, + layout_builder, + line_builder, + ocr_builder, + callback_url=self.callback_url, + docId=self.docId, + second_layout_builder=self.molecule_layout_builder + ) + + # 在文档构建完成后清理分子检测相关的内存 + if self.use_molecule_detection and self.molecule_layout_builder: + try: + self.molecule_layout_builder.cleanup_memory() + print("✅ Cleaned up molecule detection memory after document building") + except Exception as e: + print(f"⚠️ Warning during molecule layout builder cleanup: {e}") + + flush_cuda_memory() + time_str = datetime.now(beijing_tz).strftime("%H:%M:%S") + send_callback(self.callback_url, { + 'status': True, + 'messages': 'success', + 'docId': self.docId, + 'progress': 68, + 'progress_text': '完成OCR ' + time_str + }) + structure_builder_cls = self.resolve_dependencies(StructureBuilder) structure_builder_cls(document) + flush_cuda_memory() + time_str = datetime.now(beijing_tz).strftime("%H:%M:%S") + send_callback(self.callback_url, { + 'status': True, + 'messages': 'success', + 'docId': self.docId, + 'progress': 91, + 'progress_text': '完成文档重建 ' + time_str + }) + + # 移除 DebugProcessor 和 IgnoreTextProcessor + self.processor_list = [p for p in self.processor_list + if not isinstance(p, (DebugProcessor, IgnoreTextProcessor))] + for processor in self.processor_list: processor(document) return document - def __call__(self, filepath: str): - document = self.build_document(filepath) + def __call__(self, filepath: str, file_type: str='pdf'): + document = self.build_document(filepath, file_type=file_type) renderer = self.resolve_dependencies(self.renderer) return renderer(document) diff --git a/marker/docs/molecule_detection_integration.md b/marker/docs/molecule_detection_integration.md new file mode 100644 index 00000000..2b1d773b --- /dev/null +++ b/marker/docs/molecule_detection_integration.md @@ -0,0 +1,191 @@ +# 化学分子识别集成指南 + +本文档介绍如何在marker中集成img2mol进行化学分子和分子表格的识别。 + +## 概述 + +通过集成img2mol的`Parser_Processer`,marker现在可以: + +1. **检测化学分子结构** - 输出`...`标签 +2. **检测分子数据表格** - 输出`...`标签 +3. **智能替换原有内容** - 使用IOU>90%的阈值替换重叠的layout blocks + +## 系统架构 + +``` +PDF文档 → 标准Layout检测 → 分子Layout检测 → 结果合并 → 最终输出 + ↓ ↓ ↓ + 普通blocks 分子/表格blocks 智能替换 +``` + +## 新增组件 + +### 1. BlockTypes +- `Molecule`: 化学分子结构 +- `MoleculeTable`: 包含分子数据的表格 + +### 2. Block类 +- `Molecule`: 输出`...`标签 +- `MoleculeTable`: 输出`...`标签 + +### 3. 
MoleculeLayoutBuilder +集成img2mol的`Parser_Processer`,负责: +- 调用分子检测模型 +- 调用表格检测模型 +- 执行智能替换逻辑 + +## 配置参数 + +### processor_config +传入`Parser_Processer`的配置参数: + +```python +processor_config = { + "device": "cuda", # 设备: "cuda" 或 "cpu" + "with_mol_detect": True, # 启用分子检测 + "with_table_detect": True, # 启用表格检测 + "use_yolo_mol_model": True, # 使用YOLO分子模型 + "use_yolo_table_model": True, # 使用YOLO表格模型 + "use_yolo_table_model_v2": True, # 使用YOLO表格模型v2 + "debug": False, # 调试模式 + "num_workers": 1, # 工作进程数 + "padding": 0 # 图像填充 +} +``` + +### 替换阈值 +- `overlap_threshold`: 分子替换阈值,默认0.9 (90%) +- `table_overlap_threshold`: 表格替换阈值,默认0.9 (90%) + +## 使用方法 + +### 基本用法 + +```python +from marker.converters.pdf import PdfConverter + +# 配置分子识别 +processor_config = { + "device": "cuda", + "with_mol_detect": True, + "with_table_detect": True, +} + +artifact_dict = { + "processor_config": processor_config, +} + +config = { + "use_molecule_detection": True, +} + +# 创建转换器 +converter = PdfConverter( + artifact_dict=artifact_dict, + config=config +) + +# 处理PDF +result = converter("chemistry_paper.pdf") +``` + +### 高级配置 + +```python +# 自定义替换阈值 +processor_config = { + "device": "cuda", + "with_mol_detect": True, + "with_table_detect": True, + "debug": True, # 启用调试 +} + +# 可以通过config调整阈值(如果需要的话) +config = { + "use_molecule_detection": True, + "use_llm": True, # 同时启用LLM处理 +} +``` + +## 替换逻辑 + +### 分子替换 +- 检测到的分子区域与**任何类型**的原有block重叠度>90%时进行替换 +- 保持原有的页面结构顺序 + +### 表格替换 +- 检测到的分子表格**只替换原有的Table类型**的block +- IOU>90%时进行替换 +- 其他类型的block不会被表格替换 + +### 替换过程 +1. 计算新检测block与现有block的重叠度 +2. 标记需要替换的原有blocks +3. 在原位置插入新的分子/表格blocks +4. 从页面结构中移除被替换的blocks + +## 输出格式 + +### 分子结构 +```html + + + +``` + +### 分子表格 +```html + + + +``` + +## 依赖要求 + +确保安装了img2mol的相关依赖: + +```bash +# img2mol相关依赖 +pip install torch torchvision +pip install ultralytics # YOLO模型 +pip install rdkit-pypi # 化学计算 +# ... 其他img2mol依赖 +``` + +## 故障排除 + +### 常见问题 + +1. **ImportError: img2mol.processor not found** + - 确保img2mol在Python路径中 + - 检查img2mol/processor.py文件存在 + +2. **CUDA内存不足** + - 设置`device: "cpu"`使用CPU + - 减少batch_size + +3. **模型文件缺失** + - 确保YOLO模型文件路径正确 + - 检查img2mol的模型配置 + +### 调试建议 + +1. 启用调试模式:`"debug": True` +2. 检查processor初始化日志 +3. 验证输入图像格式 + +## 性能优化 + +1. **GPU使用**: 确保CUDA可用时使用GPU +2. **批处理**: 调整batch_size以平衡速度和内存 +3. **模型缓存**: 预加载模型以避免重复初始化 + +## 扩展说明 + +如果需要进一步的分子内容识别(如SMILES生成),可以: + +1. 在`Molecule`和`MoleculeTable`类中添加额外的处理方法 +2. 创建专门的分子处理processor +3. 
在后处理阶段进行分子结构识别 + +该集成框架为化学文档处理提供了强大的基础设施。 \ No newline at end of file diff --git a/marker/examples/molecule_detection_example.py b/marker/examples/molecule_detection_example.py new file mode 100644 index 00000000..332a3131 --- /dev/null +++ b/marker/examples/molecule_detection_example.py @@ -0,0 +1,188 @@ +#!/usr/bin/env python3 +""" +分子识别集成示例 + +展示如何使用marker集成img2mol进行化学分子和表格识别 +""" + +import os +import sys +from typing import Dict, Any + +# 添加marker路径 +sys.path.append(os.path.dirname(os.path.dirname(__file__))) + +from marker.converters.pdf import PdfConverter +from marker.settings import settings + + +def create_molecule_detection_converter( + processor_config: Dict[str, Any] = None, + use_gpu: bool = True, + model_paths: Dict[str, str] = None +) -> PdfConverter: + """ + 创建支持分子识别的PDF转换器 + + Args: + processor_config: img2mol processor的配置参数 + use_gpu: 是否使用GPU + model_paths: 模型文件路径配置 + + Returns: + 配置好的PdfConverter实例 + """ + + # 默认的processor配置 + default_processor_config = { + "device": "cuda" if use_gpu and settings.TORCH_DEVICE_MODEL == "cuda" else "cpu", + "with_mol_detect": True, + "with_table_detect": True, + "use_yolo_mol_model": True, + "use_yolo_table_model": True, + "use_yolo_table_model_v2": True, + "debug": False, + "num_workers": 1, + "padding": 0 + } + + # 如果提供了模型路径,添加到配置中 + if model_paths: + if "mol_model_path" in model_paths: + default_processor_config["MolDetect_mol_path"] = model_paths["mol_model_path"] + if "table_model_path" in model_paths: + default_processor_config["td_model_path"] = model_paths["table_model_path"] + + if processor_config: + default_processor_config.update(processor_config) + + # artifact_dict配置 + artifact_dict = { + "processor_config": default_processor_config, + } + + # converter配置 + config = { + "use_molecule_detection": True, # 启用分子检测 + "use_llm": False, # 根据需要启用LLM + } + + # 创建converter + converter = PdfConverter( + artifact_dict=artifact_dict, + config=config + ) + + return converter + + +def process_pdf_with_molecule_detection( + pdf_path: str, + output_path: str = None, + processor_config: Dict[str, Any] = None, + model_paths: Dict[str, str] = None +): + """ + 处理包含化学分子的PDF文件 + + Args: + pdf_path: PDF文件路径 + output_path: 输出文件路径,如果为None则使用默认路径 + processor_config: processor配置 + model_paths: 模型文件路径配置 + """ + + if not os.path.exists(pdf_path): + raise FileNotFoundError(f"PDF file not found: {pdf_path}") + + # 创建converter + converter = create_molecule_detection_converter( + processor_config=processor_config, + model_paths=model_paths + ) + + try: + print(f"开始处理PDF: {pdf_path}") + print("正在进行分子和表格识别...") + + # 处理PDF + result = converter(pdf_path) + + # 保存结果 + if output_path is None: + base_name = os.path.splitext(os.path.basename(pdf_path))[0] + output_path = f"{base_name}_with_molecules.md" + + with open(output_path, 'w', encoding='utf-8') as f: + f.write(result) + + print(f"处理完成!结果已保存到: {output_path}") + + # 统计结果 + mol_count = result.count('') + mol_table_count = result.count('') + print(f"检测到 {mol_count} 个分子结构") + print(f"检测到 {mol_table_count} 个分子表格") + + return result + + except Exception as e: + print(f"处理PDF时出错: {e}") + raise + + +def main(): + """主函数 - 示例用法""" + + # 模型文件路径配置(请根据实际情况修改) + model_paths = { + # "mol_model_path": "/path/to/your/mol_detection_model", + # "table_model_path": "/path/to/your/table_detection_model", + } + + # 示例配置 + processor_config = { + "device": "cuda", # 或 "cpu" + "debug": True, # 启用调试模式 + "with_mol_detect": True, + "with_table_detect": True, + # 可以添加更多配置项 + "use_yolo_mol_model": True, + "use_yolo_table_model": True, + 
"use_yolo_table_model_v2": True, + } + + # 示例PDF路径(请替换为实际路径) + pdf_path = "example_chemistry_paper.pdf" + + if os.path.exists(pdf_path): + try: + result = process_pdf_with_molecule_detection( + pdf_path=pdf_path, + processor_config=processor_config, + model_paths=model_paths + ) + + print("\n分子识别成功完成!") + print("输出中包含以下标签:") + print("- ...: 化学分子结构") + print("- ...: 包含分子数据的表格") + print("\n特点:") + print("- IOU>90%的分子表格会替换原有的普通表格") + print("- 保持原有的文档结构和顺序") + print("- 支持各种分子结构和表格格式") + + except Exception as e: + print(f"错误: {e}") + else: + print(f"示例PDF文件不存在: {pdf_path}") + print("请将此脚本中的pdf_path变量设置为实际的PDF文件路径") + print("\n使用方法:") + print("1. 确保安装了img2mol及其依赖") + print("2. 配置模型文件路径(如果需要)") + print("3. 设置正确的PDF文件路径") + print("4. 运行脚本") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/marker/examples/real_molecule_example.py b/marker/examples/real_molecule_example.py new file mode 100644 index 00000000..5f62fbd0 --- /dev/null +++ b/marker/examples/real_molecule_example.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 +""" +真实img2mol集成使用示例 +展示如何正确配置和使用真实的分子识别功能 +""" + +import os +import sys +from pathlib import Path + +# 添加marker路径 +current_dir = Path(__file__).parent.parent.parent +sys.path.append(str(current_dir)) + +from marker.main import marker_main +from marker.config.parser import ConfigParser +from marker.config.processor import apply_config + +def create_real_molecule_config(): + """创建真实img2mol处理的配置""" + + # img2mol处理器配置 + processor_config = { + # 基础设置 + 'device': 'cuda' if torch.cuda.is_available() else 'cpu', # 优先使用GPU + 'debug': True, # 开启调试模式 + 'num_workers': 1, # 工作线程数 + 'padding': 0, # 图像填充 + + # 分子检测设置 + 'with_mol_detect': True, # 启用分子检测 + 'use_yolo_mol_model': True, # 使用YOLO分子模型 + 'new_class_token': True, # 使用新的类标记 + + # 表格检测设置 + 'with_table_detect': True, # 启用表格检测 + 'use_yolo_table_model': True, # 使用YOLO表格模型 + 'use_yolo_table_model_v2': True, # 使用YOLO表格模型v2 + + # OCR设置 + 'use_trocr_mfr_model_v3': True, # 使用TrOCR数学公式识别模型v3 + 'use_got_ocr_model': True, # 使用GOT OCR模型 + 'with_ocr': False, # 在分子检测时不使用OCR + + # 模型预加载设置(可选,影响启动速度) + 'preload_table_and_ocr_model': True, # 预加载表格和OCR模型 + + # 高级设置 + 'use_tta': True, # 使用测试时间增强 + 'coref': True, # 使用共指消解 + 'with_padding': False, # 不使用图像填充 + + # 模型路径(如果需要指定特定模型) + # 'MolDetect_mol_path': '/path/to/mol/model', + # 'model_dir': '/path/to/models' + } + + # marker主要配置 + config = { + # 启用分子检测 + 'use_molecule_detection': True, + + # 传递处理器配置 + 'processor_config': processor_config, + + # 其他marker配置 + 'max_pages': None, # 处理所有页面 + 'start_page': None, # 从第一页开始 + 'languages': ['en'], # 支持的语言 + 'batch_multiplier': 1, # 批处理倍数 + 'ocr_all_pages': False, # 不对所有页面进行OCR + + # 输出格式 + 'output_format': 'markdown', # 输出格式为markdown + 'extract_images': True, # 提取图像 + } + + return config + +def process_chemical_pdf(pdf_path, output_dir=None): + """ + 处理化学PDF文档,检测分子和表格 + + Args: + pdf_path: PDF文件路径 + output_dir: 输出目录,如果为None则使用默认目录 + + Returns: + 处理结果的路径 + """ + + # 验证输入文件 + pdf_path = Path(pdf_path) + if not pdf_path.exists(): + raise FileNotFoundError(f"PDF文件不存在: {pdf_path}") + + # 设置输出目录 + if output_dir is None: + output_dir = pdf_path.parent / f"{pdf_path.stem}_molecule_output" + else: + output_dir = Path(output_dir) + + output_dir.mkdir(exist_ok=True) + + print(f"🧬 开始处理化学PDF: {pdf_path}") + print(f"📁 输出目录: {output_dir}") + + # 创建配置 + config = create_real_molecule_config() + + try: + # 调用marker主函数 + result = marker_main( + pdf_path=str(pdf_path), + output_dir=str(output_dir), + config=config, + artifact_dict={ + 'processor_config': 
config['processor_config'] # 确保传递处理器配置 + } + ) + + print(f"✅ 处理完成!") + print(f"📄 输出文件: {result}") + + return result + + except Exception as e: + print(f"❌ 处理失败: {e}") + import traceback + traceback.print_exc() + return None + +def main(): + """主函数""" + import argparse + + parser = argparse.ArgumentParser(description='使用真实img2mol处理化学PDF文档') + parser.add_argument('pdf_path', help='输入PDF文件路径') + parser.add_argument('--output-dir', '-o', help='输出目录路径') + parser.add_argument('--mock', action='store_true', help='使用mock模式(测试用)') + + args = parser.parse_args() + + # 如果使用mock模式,修改配置 + if args.mock: + print("🎭 使用Mock模式进行测试") + config = create_real_molecule_config() + config['processor_config']['use_mock_data'] = True + + # 处理PDF + result = process_chemical_pdf(args.pdf_path, args.output_dir) + + if result: + print(f"\n🎉 处理成功完成!") + print(f"📋 结果文件: {result}") + print(f"\n💡 提示: 查看输出的markdown文件,其中包含:") + print(f" • ... 标签标记的分子结构") + print(f" • ... 标签标记的分子表格") + else: + print(f"\n❌ 处理失败,请检查错误消息") + sys.exit(1) + +if __name__ == "__main__": + # 导入必要的模块 + try: + import torch + except ImportError: + print("Warning: PyTorch not available, defaulting to CPU") + torch = None + + main() \ No newline at end of file diff --git a/marker/oss_uploader.py b/marker/oss_uploader.py new file mode 100644 index 00000000..6d238a68 --- /dev/null +++ b/marker/oss_uploader.py @@ -0,0 +1,82 @@ + +import os +import boto3 +from botocore.config import Config +import string +import random +from datetime import datetime +from pathlib import Path +import traceback +from typing import Union +from dotenv import load_dotenv + +load_dotenv(os.path.join(os.path.dirname(__file__), '..', '.env')) + + +class S3Client: + def __init__(self, + endpoint_url=os.getenv("AWS_S3_ENDPOINT_URL"), + aws_access_key_id=os.getenv("AWS_S3_ACCESS_KEY_ID"), + aws_secret_access_key=os.getenv("AWS_S3_SECRET_ACCESS_KEY"), + region_name=os.getenv("AWS_S3_REGION_NAME"), + bucket=os.getenv("AWS_STORAGE_BUCKET_NAME") + ): + self.connection_data = { + 'endpoint_url': endpoint_url, + 'aws_access_key_id': aws_access_key_id, + 'aws_secret_access_key': aws_secret_access_key, + 'region_name': region_name, + } + self.bucket = bucket + self.s3_session = boto3.client('s3', config=Config(s3={'addressing_style': 'virtual'}), **self.connection_data) + + @staticmethod + def random_str(): + alphabet = string.ascii_lowercase + string.digits + return ''.join(random.choices(alphabet, k=6)) + + def s3_upload_from_file(self, filename: str, filebytes: bytes): + p = Path(filename) + date = datetime.now().strftime('%Y%m%d') + key = f'qilu-brain/wemol/{date}/{p.stem}_qilu-brain_{self.random_str()}{p.suffix}' + return self._s3_upload(filebytes, key) + + def download_from_s3(self, key: str): + try: + response = self.s3_session.get_object(Bucket=self.bucket, Key=key) + # Extract and read the file content from the streaming body + content = response['Body'].read() + return content + except Exception as e: + traceback.print_exc() + print(f'download from s3 failed, cause {str(e)}') + return None + + def delete_from_s3(self, key: str): + try: + self.s3_session.delete_object(Bucket=self.bucket, Key=key) + return True + except Exception as e: + traceback.print_exc() + print(f'delete from s3 failed, cause {str(e)}') + return False + + def _s3_upload(self, body: Union[str, bytes], key: str): + try: + _ = self.s3_session.put_object(Body=body, Bucket=self.bucket, Key=key) + # 生成url,不过期的 + url = self.s3_session.generate_presigned_url( + ClientMethod='get_object', + Params={'Bucket': self.bucket, 
'Key': key}, + ExpiresIn=None, + HttpMethod='GET' + ) + return { + 'url': url, + 'key': key + } + + except Exception as e: + traceback.print_exc() + print(f'upload object failed, cause {str(e)}') + return {} diff --git a/marker/providers/__init__.py b/marker/providers/__init__.py index b8c48373..a89ced82 100644 --- a/marker/providers/__init__.py +++ b/marker/providers/__init__.py @@ -75,19 +75,40 @@ def __enter__(self): def get_font_css(): from weasyprint import CSS from weasyprint.text.fonts import FontConfiguration + import os font_config = FontConfiguration() - css = CSS(string=f''' - @font-face {{ - font-family: GoNotoCurrent-Regular; - src: url({settings.FONT_PATH}); - font-display: swap; - }} + + # Check if font file exists and create proper file URI + font_src = "" + if os.path.exists(settings.FONT_PATH): + # Convert to absolute path and use file:// URI scheme + abs_font_path = os.path.abspath(settings.FONT_PATH) + font_src = f"url(file://{abs_font_path})" + + # Create CSS with proper font fallback + css_content = f''' body {{ - font-family: {settings.FONT_NAME.split(".")[0]}, sans-serif; + font-family: "DejaVu Sans", "Liberation Sans", Arial, sans-serif; font-variant-ligatures: none; font-feature-settings: "liga" 0; - text-rendering: optimizeLegibility; }} - ''', font_config=font_config) + ''' + + # Only add @font-face if font file exists + if font_src: + css_content = f''' + @font-face {{ + font-family: GoNotoCurrent-Regular; + src: {font_src}; + font-display: swap; + }} + body {{ + font-family: GoNotoCurrent-Regular, "DejaVu Sans", "Liberation Sans", Arial, sans-serif; + font-variant-ligatures: none; + font-feature-settings: "liga" 0; + }} + ''' + + css = CSS(string=css_content, font_config=font_config) return css diff --git a/marker/providers/document.py b/marker/providers/document.py index 4dfd1038..61a33c58 100644 --- a/marker/providers/document.py +++ b/marker/providers/document.py @@ -23,7 +23,7 @@ div, p { max-width: 100%; - word-break: break-word; + overflow-wrap: break-word; font-size: 10pt; } @@ -62,23 +62,165 @@ def __init__(self, filepath: str, config=None): super().__init__(self.temp_pdf_path, config) def __del__(self): - if os.path.exists(self.temp_pdf_path): - os.remove(self.temp_pdf_path) + # if os.path.exists(self.temp_pdf_path): + # os.remove(self.temp_pdf_path) + pass + + def _convert_image_to_base64(self, image): + """Convert mammoth image to base64 data URI""" + try: + with image.open() as image_bytes: + import base64 + return "data:" + image.content_type + ";base64," + base64.b64encode(image_bytes.read()).decode() + except Exception as e: + print(f"Failed to convert image: {e}") + return "" def convert_docx_to_pdf(self, filepath: str): from weasyprint import CSS, HTML import mammoth + import re with open(filepath, "rb") as docx_file: - # we convert the docx to HTML - result = mammoth.convert_to_html(docx_file) + # Configure style mapping to preserve heading levels + style_map = """ + p[style-name='Heading 1'] => h1:fresh + p[style-name='Heading 2'] => h2:fresh + p[style-name='Heading 3'] => h3:fresh + p[style-name='Heading 4'] => h4:fresh + p[style-name='Heading 5'] => h5:fresh + p[style-name='Heading 6'] => h6:fresh + p[style-name='标题 1'] => h1:fresh + p[style-name='标题 2'] => h2:fresh + p[style-name='标题 3'] => h3:fresh + p[style-name='标题 4'] => h4:fresh + p[style-name='标题 5'] => h5:fresh + p[style-name='标题 6'] => h6:fresh + p:empty => p:fresh + """ + + # Configure mammoth options for better conversion + convert_options = { + "convert_image": 
mammoth.images.img_element(lambda image: { + "src": self._convert_image_to_base64(image) + }), + "ignore_empty_paragraphs": False, + "style_map": style_map + } + + # we convert the docx to HTML with better options + result = mammoth.convert_to_html(docx_file, **convert_options) html = result.value + + # Print conversion messages if any + if result.messages: + print(f"Mammoth conversion messages: {result.messages}", flush=True) + + # Post-process HTML to normalize headings and preserve empty lines + html = self._normalize_html(html) + + # Debug: Print HTML content length and preview + print(f"Generated HTML length: {len(html)} characters", flush=True) + if html: + preview = html[:500].replace('\n', ' ') + print(f"HTML preview: {preview}...", flush=True) + else: + print("WARNING: Generated HTML is empty!", flush=True) # We convert the HTML into a PDF - HTML(string=self._preprocess_base64_images(html)).write_pdf( - self.temp_pdf_path, - stylesheets=[CSS(string=css), self.get_font_css()] - ) + processed_html = self._preprocess_base64_images(html) + if not processed_html.strip(): + print("ERROR: Processed HTML is empty, adding fallback content", flush=True) + processed_html = "

<p>Document conversion failed - no content extracted</p>
" + + # Wrap in proper HTML structure for better PDF conversion + if not processed_html.startswith('{processed_html}" + + print(f"Final HTML for PDF conversion (length: {len(processed_html)}):", flush=True) + print(f"HTML content: {processed_html[:1000]}...", flush=True) + + # Use improved CSS with better heading styling and spacing + simple_css = ''' + @page { + size: A4; + margin: 2cm; + } + body { + font-family: serif; + font-size: 12pt; + line-height: 1.6; + } + h1 { + font-size: 20pt; + font-weight: bold; + margin: 1.5em 0 1em 0; + line-height: 1.3; + } + h2 { + font-size: 16pt; + font-weight: bold; + margin: 1.2em 0 0.8em 0; + line-height: 1.4; + padding: 0.2em 0; + } + h3 { + font-size: 14pt; + font-weight: bold; + margin: 1em 0 0.6em 0; + line-height: 1.3; + } + h4, h5, h6 { + font-size: 13pt; + font-weight: bold; + margin: 0.8em 0 0.4em 0; + line-height: 1.3; + } + p { + margin: 0.5em 0; + font-family: serif; + line-height: 1.6; + } + p:empty { + margin: 0.5em 0; + min-height: 1em; + } + strong { + font-weight: bold; + } + ''' + + try: + print("Starting HTML to PDF conversion...", flush=True) + html_doc = HTML(string=processed_html) + print("HTML document created successfully", flush=True) + + html_doc.write_pdf( + self.temp_pdf_path, + stylesheets=[CSS(string=simple_css)] + ) + print(f"PDF conversion completed: {self.temp_pdf_path}", flush=True) + + # Check if PDF file was created and has content + import os + if os.path.exists(self.temp_pdf_path): + pdf_size = os.path.getsize(self.temp_pdf_path) + print(f"Generated PDF size: {pdf_size} bytes", flush=True) + else: + print("ERROR: PDF file was not created!", flush=True) + + except Exception as e: + print(f"ERROR during HTML to PDF conversion: {e}", flush=True) + import traceback + traceback.print_exc() + + # Create a minimal fallback PDF + fallback_html = f"

<html><body><h1>Document Conversion</h1><p>Original content length: {len(html)} characters</p><p>Error: {str(e)}</p></body></html>
" + try: + HTML(string=fallback_html).write_pdf(self.temp_pdf_path, stylesheets=[CSS(string=simple_css)]) + print("Created fallback PDF", flush=True) + except Exception as fallback_error: + print(f"Even fallback PDF creation failed: {fallback_error}", flush=True) @staticmethod def _preprocess_base64_images(html_content): @@ -100,3 +242,31 @@ def convert_image(match): return "" # we ditch broken images as that breaks the PDF creation down the line return re.sub(pattern, convert_image, html_content) + + def _normalize_html(self, html): + """Normalize HTML to ensure consistent heading levels and preserve empty lines""" + import re + + # Convert all h2 tags to have consistent styling (force them to be treated equally) + # This helps prevent marker from incorrectly assigning different levels + html = re.sub(r']*)>', r'
<h2>
', html) + + # Preserve empty paragraphs by adding non-breaking space + html = re.sub(r'<p></p>', '<p>&nbsp;</p>
', html) + html = re.sub(r'<p>\s*</p>', '<p>&nbsp;</p>
', html) + + # Handle cases where mammoth might create empty paragraphs with just whitespace + html = re.sub(r'<p>(\s*)</p>', r'<p>&nbsp;</p>
', html) + + # Add proper spacing after headings + html = re.sub(r'', r'', html) + + # Ensure paragraphs have at least some content for proper rendering + html = re.sub(r'<p>\s*<\/p>', '<p>&nbsp;</p>
', html) + + # Add extra spacing for better readability + html = re.sub(r'<p>', r'\n<p>&nbsp;</p>
', html) + + print(f"Normalized HTML preview: {html[:800]}...", flush=True) + + return html diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py index bba058d9..a2643530 100644 --- a/marker/providers/pdf.py +++ b/marker/providers/pdf.py @@ -43,7 +43,7 @@ class PdfProvider(BaseProvider): flatten_pdf: Annotated[ bool, "Whether to flatten the PDF structure.", - ] = True + ] = True # True force_ocr: Annotated[ bool, "Whether to force OCR on the whole document.", @@ -82,6 +82,10 @@ def __init__(self, filepath: str, config=None): self.filepath = filepath + # 如果config中指定了flatten_pdf,则使用config中的值覆盖默认值 + if config and 'flatten_pdf' in config: + self.flatten_pdf = config['flatten_pdf'] + with self.get_doc() as doc: self.page_count = len(doc) self.page_lines: ProviderPageLines = {i: [] for i in range(len(doc))} @@ -110,8 +114,9 @@ def get_doc(self): # Must be called on the parent pdf, before retrieving pages to render correctly if self.flatten_pdf: + print(f"[PdfProvider] flatten_pdf is True, init_forms", flush=True) doc.init_forms() - + print('@@@@doc', doc, dir(doc), flush=True) yield doc finally: if doc: @@ -397,6 +402,7 @@ def _render_image( page = pdf[idx] image = page.render(scale=dpi / 72, draw_annots=False).to_pil() image = image.convert("RGB") + print('@@@@@@@@@@image', image, image.size, flush=True) return image def get_images(self, idxs: List[int], dpi: int) -> List[Image.Image]: diff --git a/marker/providers/powerpoint.py b/marker/providers/powerpoint.py index 4c7e6987..8df8bf43 100644 --- a/marker/providers/powerpoint.py +++ b/marker/providers/powerpoint.py @@ -75,9 +75,14 @@ def convert_pptx_to_pdf(self, filepath): # Process shapes in the slide for shape in slide.shapes: # If shape is a group shape, we recursively handle all grouped shapes - if shape.shape_type == MSO_SHAPE_TYPE.GROUP: - html_parts.append(self._handle_group(shape)) - continue + try: + shape_type = shape.shape_type + except Exception as e: + print(traceback.format_exc()) + else: + if shape_type == MSO_SHAPE_TYPE.GROUP: + html_parts.append(self._handle_group(shape)) + continue # If shape is a table if shape.has_table: @@ -85,9 +90,14 @@ def convert_pptx_to_pdf(self, filepath): continue # If shape is a picture - if shape.shape_type == MSO_SHAPE_TYPE.PICTURE: - html_parts.append(self._handle_image(shape)) - continue + try: + shape_type = shape.shape_type + except Exception as e: + print(traceback.format_exc()) + else: + if shape_type == MSO_SHAPE_TYPE.PICTURE: + html_parts.append(self._handle_image(shape)) + continue # If shape has text if hasattr(shape, "text") and shape.text is not None: @@ -115,17 +125,27 @@ def _handle_group(self, group_shape) -> str: group_parts = [] for shape in group_shape.shapes: - if shape.shape_type == MSO_SHAPE_TYPE.GROUP: - group_parts.append(self._handle_group(shape)) - continue + try: + shape_type = shape.shape_type + except Exception as e: + print(traceback.format_exc()) + else: + if shape_type == MSO_SHAPE_TYPE.GROUP: + group_parts.append(self._handle_group(shape)) + continue if shape.has_table: group_parts.append(self._handle_table(shape)) continue - - if shape.shape_type == MSO_SHAPE_TYPE.PICTURE: - group_parts.append(self._handle_image(shape)) - continue + + try: + shape_type = shape.shape_type + except Exception as e: + print(traceback.format_exc()) + else: + if shape_type == MSO_SHAPE_TYPE.PICTURE: + group_parts.append(self._handle_image(shape)) + continue if hasattr(shape, "text"): if shape.has_text_frame: diff --git a/marker/providers/registry.py 
b/marker/providers/registry.py index 4a8e969e..23aee12c 100644 --- a/marker/providers/registry.py +++ b/marker/providers/registry.py @@ -56,7 +56,21 @@ def provider_from_ext(filepath: str): return PdfProvider -def provider_from_filepath(filepath: str): +def provider_from_filepath(filepath: str, file_type: str='pdf'): + if file_type == 'jpg' or file_type == 'png' or file_type == 'jpeg': + return ImageProvider + elif file_type == 'pdf': + return PdfProvider + elif file_type == 'docx' or file_type == 'doc': + return DocumentProvider + elif file_type == 'pptx' or file_type == 'ppt': + return PowerPointProvider + elif file_type == 'epub': + return EpubProvider + elif file_type == 'html': + return HTMLProvider + + # If file_type is not explicitly handled, fall back to content-based detection if filetype.image_match(filepath) is not None: return ImageProvider if file_match(filepath, load_matchers("pdf")) is not None: diff --git a/marker/renderers/__init__.py b/marker/renderers/__init__.py index 2a8cbe77..a6dccb1e 100644 --- a/marker/renderers/__init__.py +++ b/marker/renderers/__init__.py @@ -13,9 +13,17 @@ from marker.settings import settings from marker.util import assign_config +# Import OSS uploader +from marker.oss_uploader import S3Client +s3_client = S3Client() +S3_AVAILABLE = True + class BaseRenderer: - image_blocks: Annotated[Tuple[BlockTypes, ...], "The block types to consider as images."] = (BlockTypes.Picture, BlockTypes.Figure) + image_blocks: Annotated[ + Tuple[BlockTypes, ...], + "The block types to consider as images." + ] = (BlockTypes.Picture, BlockTypes.Figure, BlockTypes.Molecule, BlockTypes.MoleculeTable) extract_images: Annotated[bool, "Extract images from the document."] = True image_extraction_mode: Annotated[ Literal["lowres", "highres"], @@ -31,15 +39,83 @@ def __call__(self, document): raise NotImplementedError def extract_image(self, document: Document, image_id, to_base64=False): + print(f"🖼️ [DEBUG] BaseRenderer.extract_image() called for {image_id}") + image_block = document.get_block(image_id) + if image_block is None: + print(f"❌ [DEBUG] Image block not found for {image_id}") + return None + + print(f"✅ [DEBUG] Found image block: {type(image_block).__name__} (type: {image_block.block_type})") + cropped = image_block.get_image(document, highres=self.image_extraction_mode == "highres") + + if cropped is None: + print(f"❌ [DEBUG] Failed to get image from block {image_id}") + return None + + print(f"✅ [DEBUG] Got cropped image: {cropped.size}") if to_base64: image_buffer = io.BytesIO() cropped.save(image_buffer, format=settings.OUTPUT_IMAGE_FORMAT) cropped = base64.b64encode(image_buffer.getvalue()).decode(settings.OUTPUT_ENCODING) + print(f"✅ [DEBUG] Converted to base64 (length: {len(cropped)})") + return cropped + def upload_image_to_s3(self, image, image_name, image_type="image", metadata=None): + """ + Upload image to S3 if available, otherwise return None + + Args: + image: PIL Image object + image_name: Original image name + image_type: Type of image (image, molecule, etc.) 
+ metadata: Additional metadata (currently not used in S3 implementation) + + Returns: + S3 upload result dict with 'url' and 'key', or None + """ + print(f"☁️ [DEBUG] upload_image_to_s3() called: {image_name} (type: {image_type})") + + if image is None: + print(f"❌ [DEBUG] Image is None, cannot upload") + return None + + print(f"🖼️ [DEBUG] Image size: {image.size}") + + if S3_AVAILABLE and s3_client: + try: + print(f"✅ [DEBUG] S3 client available, starting upload...") + + # Convert PIL Image to bytes + img_buffer = io.BytesIO() + image.save(img_buffer, format='JPEG', quality=85, optimize=True) + image_data = img_buffer.getvalue() + + print(f"📦 [DEBUG] Image converted to bytes: {len(image_data)} bytes") + + # Upload to S3 + result = s3_client.s3_upload_from_file(image_name, image_data) + + if result and 'url' in result: + print(f"✅ [DEBUG] Image uploaded successfully: {result['url']}") + return result + else: + print(f"❌ [DEBUG] Failed to upload image: {image_name}, result: {result}") + return None + + except Exception as e: + print(f"❌ [DEBUG] Failed to upload image to S3: {e}") + import traceback + traceback.print_exc() + return None + else: + print(f"❌ [DEBUG] S3 not available (S3_AVAILABLE: {S3_AVAILABLE}, s3_client: {s3_client is not None})") + + return None + @staticmethod def merge_consecutive_math(html, tag="math"): if not html: @@ -53,36 +129,42 @@ def merge_consecutive_math(html, tag="math"): @staticmethod def merge_consecutive_tags(html, tag): - if not html: - return html - - def replace_whitespace(match): - whitespace = match.group(1) - if len(whitespace) == 0: - return "" - else: - return " " - - pattern = fr'(\s*)<{tag}>' - - while True: - new_merged = re.sub(pattern, replace_whitespace, html) - if new_merged == html: - break - html = new_merged - + pattern = f'(\s*)<{tag}>' + html = re.sub(pattern, r'\1', html) return html + def get_page_footer(self, page: any): + try: + for block in page.children: + if block.block_type == BlockTypes.PageFooter: + return block.raw_text(page) + except Exception as e: + print('get_page_footer', e, flush=True) + return '' + + def get_page_header(self, page: any): + try: + for block in page.children: + if block.block_type == BlockTypes.PageHeader: + return block.raw_text(page) + except Exception as e: + print('get_page_header', e, flush=True) + return '' + def generate_page_stats(self, document: Document, document_output): page_stats = [] for page in document.pages: block_counts = Counter([str(block.block_type) for block in page.children]).most_common() block_metadata = page.aggregate_block_metadata() + page_header = self.get_page_header(page) + page_footer = self.get_page_footer(page) page_stats.append({ "page_id": page.page_id, "text_extraction_method": page.text_extraction_method, "block_counts": block_counts, - "block_metadata": block_metadata.model_dump() + "block_metadata": block_metadata.model_dump(), + "page_header": page_header, + "page_footer": page_footer }) return page_stats @@ -113,12 +195,52 @@ def extract_block_html(self, document: Document, block_output: BlockOutput): break if ref_block_id.block_type in self.image_blocks and self.extract_images: - images[ref_block_id] = self.extract_image(document, ref_block_id, to_base64=True) + image = self.extract_image(document, ref_block_id, to_base64=False) + image_name = f"{ref_block_id.to_path()}.{settings.OUTPUT_IMAGE_FORMAT.lower()}" + + # Try to upload to S3 first + image_block = document.get_block(ref_block_id) + metadata = {} + + # Check if it's a molecule image + if 
ref_block_id.block_type == BlockTypes.Molecule: + if hasattr(image_block, 'structure_data') and image_block.structure_data: + metadata = image_block.structure_data + s3_result = self.upload_image_to_s3(image, image_name, "molecule", metadata) + else: + s3_result = self.upload_image_to_s3(image, image_name, "image", metadata) + + if s3_result: + # Store S3 URL information + images[ref_block_id] = {"url": s3_result['url'], "type": "s3", "original_name": image_name} + else: + # Fall back to base64 + images[ref_block_id] = self.extract_image(document, ref_block_id, to_base64=True) else: images.update(sub_images) ref.replace_with(BeautifulSoup(content, 'html.parser')) if block_output.id.block_type in self.image_blocks and self.extract_images: - images[block_output.id] = self.extract_image(document, block_output.id, to_base64=True) + image = self.extract_image(document, block_output.id, to_base64=False) + image_name = f"{block_output.id.to_path()}.{settings.OUTPUT_IMAGE_FORMAT.lower()}" + + # Try to upload to S3 first + image_block = document.get_block(block_output.id) + metadata = {} + + # Check if it's a molecule image + if block_output.id.block_type == BlockTypes.Molecule: + if hasattr(image_block, 'structure_data') and image_block.structure_data: + metadata = image_block.structure_data + s3_result = self.upload_image_to_s3(image, image_name, "molecule", metadata) + else: + s3_result = self.upload_image_to_s3(image, image_name, "image", metadata) + + if s3_result: + # Store S3 URL information + images[block_output.id] = {"url": s3_result['url'], "type": "s3", "original_name": image_name} + else: + # Fall back to base64 + images[block_output.id] = self.extract_image(document, block_output.id, to_base64=True) return str(soup), images diff --git a/marker/renderers/html.py b/marker/renderers/html.py index afe76c2b..170d3d2f 100644 --- a/marker/renderers/html.py +++ b/marker/renderers/html.py @@ -18,6 +18,11 @@ # Suppress DecompressionBombError Image.MAX_IMAGE_PIXELS = None +# Import OSS uploader +from marker.oss_uploader import S3Client +s3_client = S3Client() +S3_AVAILABLE = True + class HTMLOutput(BaseModel): html: str @@ -43,12 +48,48 @@ def extract_image(self, document, image_id): cropped = image_block.get_image(document, highres=self.image_extraction_mode == "highres") return cropped + def upload_image_to_s3(self, image, image_name, image_type="image", metadata=None): + """ + Upload image to S3 if available, otherwise return None + + Args: + image: PIL Image object + image_name: Original image name + image_type: Type of image (image, molecule, etc.) 
+ metadata: Additional metadata (currently not used in S3 implementation) + + Returns: + S3 upload result dict with 'url' and 'key', or None + """ + if S3_AVAILABLE and s3_client: + try: + # Convert PIL Image to bytes + import io + img_buffer = io.BytesIO() + image.save(img_buffer, format='JPEG', quality=85, optimize=True) + image_data = img_buffer.getvalue() + + # Upload to S3 + result = s3_client.s3_upload_from_file(image_name, image_data) + + if result and 'url' in result: + print(f"✅ Image uploaded successfully: {result['url']}") + return result + else: + print(f"❌ Failed to upload image: {image_name}") + return None + + except Exception as e: + print(f"Failed to upload image to S3: {e}") + return None + return None + def extract_html(self, document, document_output, level=0): soup = BeautifulSoup(document_output.html, 'html.parser') - content_refs = soup.find_all('content-ref') ref_block_id = None images = {} + for ref in content_refs: src = ref.get('src') sub_images = {} @@ -64,8 +105,99 @@ def extract_html(self, document, document_output, level=0): if self.extract_images: image = self.extract_image(document, ref_block_id) image_name = f"{ref_block_id.to_path()}.{settings.OUTPUT_IMAGE_FORMAT.lower()}" - images[image_name] = image - ref.replace_with(BeautifulSoup(f"

<p>{content}<img src='{image_name}'></p>
", 'html.parser')) + + # Try to upload to S3 first + image_block = document.get_block(ref_block_id) + metadata = {} + img_id = str(ref_block_id) + img_tag = "" # Initialize img_tag + + # Check if it's a molecule image + if ref_block_id.block_type == BlockTypes.Molecule: + if hasattr(image_block, 'structure_data') and image_block.structure_data: + metadata = image_block.structure_data + s3_result = self.upload_image_to_s3(image, image_name, "molecule", metadata) + # print("s3_result: ", s3_result, img_id) + if s3_result: + # Use S3 URL and custom molecule tag + images[img_id] = { + "url": s3_result['url'], + "type": "s3", + "key": s3_result['key'], + 'extra_type': 'molecule_img', + "smiles": image_block.structure_data.get('smiles', ''), + "mol_block": image_block.structure_data.get('mol_block', ''), + "label": image_block.structure_data.get('label', ''), + "page_idx": image_block.structure_data.get('page_idx', ''), + "bbox": image_block.structure_data.get('bbox', []) + } + img_tag = f'Molecule {img_id}' + else: + # Fall back to local molecule image + images[img_id] = image + img_tag = f'Molecule {img_id}' + + elif ref_block_id.block_type == BlockTypes.MoleculeTable: + # s3_result = self.upload_image_to_s3(image, image_name, "molecule_table", metadata) + + # if s3_result: + # Use S3 URL and custom molecule table tag + images[img_id] = { + "url": '', + "type": "s3", + "key": '', + "extra_type": "molecule_table", + "html_content": image_block.html, + "page_idx": image_block.structure_data.get('page_idx', ''), + "bbox": image_block.structure_data.get('bbox', []) + } + img_tag = f'Molecule Table {img_id}' + + elif ref_block_id.block_type == BlockTypes.Picture: + # Handle pictures + s3_result = self.upload_image_to_s3(image, image_name, "picture", metadata) + + if s3_result: + # Use S3 URL and custom picture tag + images[img_id] = {"url": s3_result['url'], "type": "s3", "key": s3_result['key'], 'extra_type': 'picture'} + img_tag = f'Picture {img_id}' + else: + # Fall back to local picture image + images[img_id] = image + img_tag = f'Picture {img_id}' + + elif ref_block_id.block_type == BlockTypes.Figure: + # Handle figures + s3_result = self.upload_image_to_s3(image, image_name, "figure", metadata) + + if s3_result: + # Use S3 URL and custom figure tag + images[img_id] = {"url": s3_result['url'], "type": "s3", "key": s3_result['key'], 'extra_type': 'picture'} + + img_tag = f'' + else: + # Fall back to local figure image + images[img_id] = image + img_tag = f'' + else: + # Other image types + s3_result = self.upload_image_to_s3(image, image_name, "image", metadata) + + if s3_result: + # Use S3 URL and custom image tag + images[img_id] = {"url": s3_result['url'], "type": "s3", "key": s3_result['key']} + + img_tag = f'Image {img_id}' + else: + # Fall back to standard markdown image + images[img_id] = image + img_tag = f'Image {img_id}' + + # Replace the content-ref with the content and image tag + replacement_html = f"

<p>{content}</p>
" if content else f"

<p>{img_tag}</p>
" + # replacement_html = f"{img_tag}" + + ref.replace_with(BeautifulSoup(replacement_html, 'html.parser')) else: # This will be the image description if using llm mode, or empty if not ref.replace_with(BeautifulSoup(f"{content}", 'html.parser')) @@ -94,6 +226,7 @@ def extract_html(self, document, document_output, level=0): """) + print("@@@ images: ", images) return output, images diff --git a/marker/renderers/markdown.py b/marker/renderers/markdown.py index 668377ed..3c971af7 100644 --- a/marker/renderers/markdown.py +++ b/marker/renderers/markdown.py @@ -57,6 +57,7 @@ def __init__(self, paginate_output, page_separator, inline_math_delimiters, bloc self.page_separator = page_separator self.inline_math_delimiters = inline_math_delimiters self.block_math_delimiters = block_math_delimiters + self.image_mappings = {} # Store img_id -> url mappings def convert_div(self, el, text, convert_as_inline): is_page = el.has_attr('class') and el['class'][0] == 'page' @@ -192,6 +193,8 @@ def escape(self, text): class MarkdownOutput(BaseModel): markdown: str images: dict + mol_images: dict + table_contents: dict metadata: dict @@ -217,14 +220,78 @@ def md_cls(self): block_math_delimiters=self.block_math_delimiters ) + def process_images_for_markdown(self, images): + """ + Process images dict to handle S3 URLs and local images appropriately for Markdown + + Args: + images: Dict containing image data (either URLs or binary data) + + Returns: + Processed images dict suitable for Markdown output + """ + processed_images = {} + picture_images = {} + molecule_img_images = {} + molecule_table_images = {} + + for key, value in images.items(): + if isinstance(value, dict) and value.get("extra_type", "") == "molecule_img": + # For S3 images, we store the URL info but don't include binary data + molecule_img_images[key] = { + "url": value["url"], + "type": "s3", + "key": value.get("key", ""), + "extra_type": value.get("extra_type", ""), + "smiles": value.get("smiles", ""), + "mol_block": value.get("mol_block", ""), + "label": value.get("label", ""), + "page_idx": value.get("page_idx", ""), + "bbox": value.get("bbox", []), + "original_name": value.get("original_name", str(key)) + } + elif isinstance(value, dict) and value.get("extra_type", "") == "molecule_table": + molecule_table_images[key] = { + "url": value["url"], + "type": "s3", + "key": value.get("key", ""), + "extra_type": value.get("extra_type", ""), + "html_content": value.get("html_content", ""), + "page_idx": value.get("page_idx", ""), + "bbox": value.get("bbox", ""), + "original_name": value.get("original_name", str(key)) + } + elif isinstance(value, dict) and value.get("extra_type", "") == "picture": + picture_images[key] = { + "url": value["url"], + "type": "s3", + "key": value.get("key", ""), + "extra_type": value.get("extra_type", ""), + "original_name": value.get("original_name", str(key)) + } + else: + # For local/base64 images, keep as is + processed_images[key] = value + + return processed_images, picture_images, molecule_img_images, molecule_table_images def __call__(self, document: Document) -> MarkdownOutput: document_output = document.render() full_html, images = self.extract_html(document, document_output) + + # Extract image mappings and table contents from images dict + # Process images for Markdown + _, picture_images, molecule_img_images, molecule_table_images = self.process_images_for_markdown(images) + print("@@@@ picture_images: ", picture_images) + print("@@@@ molecule_img_images: ", molecule_img_images) + print("@@@@ 
molecule_table_images: ", molecule_table_images) markdown = self.md_cls.convert(full_html) markdown = cleanup_text(markdown) + return MarkdownOutput( markdown=markdown, - images=images, + images=picture_images, + mol_images=molecule_img_images, + table_contents=molecule_table_images, metadata=self.generate_document_metadata(document, document_output) ) diff --git a/marker/schema/__init__.py b/marker/schema/__init__.py index ece0968e..8c8ce03e 100644 --- a/marker/schema/__init__.py +++ b/marker/schema/__init__.py @@ -29,6 +29,8 @@ class BlockTypes(str, Enum): ComplexRegion = auto() TableCell = auto() Reference = auto() + Molecule = auto() + MoleculeTable = auto() def __str__(self): return self.name diff --git a/marker/schema/blocks/__init__.py b/marker/schema/blocks/__init__.py index 7fe5aec6..efd01499 100644 --- a/marker/schema/blocks/__init__.py +++ b/marker/schema/blocks/__init__.py @@ -20,3 +20,4 @@ from marker.schema.blocks.complexregion import ComplexRegion from marker.schema.blocks.tablecell import TableCell from marker.schema.blocks.reference import Reference +from marker.schema.blocks.molecule import Molecule, MoleculeTable \ No newline at end of file diff --git a/marker/schema/blocks/figure.py b/marker/schema/blocks/figure.py index 83c9aea5..655faf34 100644 --- a/marker/schema/blocks/figure.py +++ b/marker/schema/blocks/figure.py @@ -8,8 +8,26 @@ class Figure(Block): block_description: str = "A chart or other image that contains data." def assemble_html(self, document, child_blocks, parent_structure): + print(f"@@@ Figure.assemble_html called - id: {self.id}, description: {self.description}") child_ref_blocks = [block for block in child_blocks if block.id.block_type == BlockTypes.Reference] + print(f"@@@ Figure child_ref_blocks count: {len(child_ref_blocks)}") html = super().assemble_html(document, child_ref_blocks, parent_structure) + print(f"@@@ Figure super().assemble_html returned: '{html}'") + + # Use consistent placeholder ID format matching HTMLRenderer + imgid = str(self.id) + print("@@@@####figure imgid: ", imgid) + if self.description: - html += f"

<p>Image {self.id} description: {self.description}</p>
" - return html + # Include both placeholder and description + placeholder = f"

<p>_placeholder_figid_{imgid}</p>
" + description = f"

<p>Image {self.id} description: {self.description}</p>
" + print(f"@@@ Figure has description: '{self.description}'") + print(f"@@@ Generated placeholder: '{placeholder}'") + print(f"@@@ Generated description: '{description}'") + # return html + placeholder + description + # else: + + final_result = f"

<p>_placeholder_imgid_{imgid}</p>
" + print(f"@@@ Figure.assemble_html returning: '{final_result}'") + return final_result diff --git a/marker/schema/blocks/molecule.py b/marker/schema/blocks/molecule.py new file mode 100644 index 00000000..ae9efc8a --- /dev/null +++ b/marker/schema/blocks/molecule.py @@ -0,0 +1,170 @@ +from marker.schema import BlockTypes +from marker.schema.blocks import Block + + +class Molecule(Block): + block_type: BlockTypes = BlockTypes.Molecule + block_description: str = "A chemical molecule structure or formula." + html: str | None = None + replace_output_newlines: bool = True + structure_data: dict = {} + confidence: float = 1.0 + + def get_image(self, document, highres=True): + """ + Extract image of the molecule from the document + + Args: + document: Document object + highres: Whether to use high resolution image + + Returns: + PIL Image of the molecule + """ + # Get the page containing this molecule + page = document.get_page(self.page_id) + if page is None: + print(f"❌ [DEBUG] Page {self.page_id} not found in document") + return None + + # Get the page image + page_image = page.get_image() + if page_image is None: + print(f"❌ [DEBUG] Failed to get page image for page {self.page_id}") + return None + + # Crop the molecule region from the page image + bbox = self.polygon.bbox + print('self.polygon.bbox', self.polygon, bbox, flush=True) + print('image', page_image.size, flush=True) + + if len(bbox) >= 4: + # bbox format: [x1, y1, x2, y2] - these are relative coordinates (0-1) + x1, y1, x2, y2 = bbox + ori_width = page.polygon.width + ori_height = page.polygon.height + x1 = x1 / ori_width * page_image.width + y1 = y1 / ori_height * page_image.height + x2 = x2 / ori_width * page_image.width + y2 = y2 / ori_height * page_image.height + + # Check if crop area is valid + if x2 <= x1 or y2 <= y1: + print(f"❌ [DEBUG] Invalid crop area: width={x2-x1}, height={y2-y1}") + return None + + # Crop the image + cropped = page_image.crop((x1, y1, x2, y2)) + + return cropped + else: + print(f"❌ [DEBUG] Invalid bbox length: {len(bbox)}") + + return None + + def assemble_html(self, document, child_blocks, parent_structure): + # Use consistent placeholder ID format matching HTMLRenderer + imgid = str(self.id) + + # 如果有自定义html + if self.html: + return f"

<p>{self.html}</p>
" + + # 如果有结构数据中的内容 + if self.structure_data.get('content'): + return f"

<p>{self.structure_data['content']}</p>
" + + # 如果有SMILES数据 + if self.structure_data.get('smiles'): + smiles = self.structure_data['smiles'] + return f"

<p>_placeholder_molid_{imgid}_label_{self.structure_data.get('label', '')}_smiles_{smiles}</p>
" + + # 默认情况使用placeholder + return f"

<p>_placeholder_molid_{imgid}</p>
" + + +class MoleculeTable(Block): + block_type: BlockTypes = BlockTypes.MoleculeTable + block_description: str = "A table containing chemical molecules or molecular data." + html: str | None = None + replace_output_newlines: bool = True + structure_data: dict = {} + table_data: dict = {} + confidence: float = 1.0 + + def get_image(self, document, highres=True): + """ + Extract image of the molecule table from the document + + Args: + document: Document object + highres: Whether to use high resolution image + + Returns: + PIL Image of the molecule table + """ + # Get the page containing this table + page = document.get_page(self.page_id) + if page is None: + print(f"❌ [DEBUG] Page {self.page_id} not found in document") + return None + + # Get the page image + page_image = page.get_image(highres=highres) + if page_image is None: + print(f"❌ [DEBUG] Failed to get page image for page {self.page_id}") + return None + + # Crop the table region from the page image + bbox = self.polygon.bbox + + if len(bbox) >= 4: + # bbox format: [x1, y1, x2, y2] - these are relative coordinates (0-1) + x1_rel, y1_rel, x2_rel, y2_rel = bbox + + # Convert relative coordinates to absolute coordinates + page_width, page_height = page_image.size + x1 = x1_rel * page_width + y1 = y1_rel * page_height + x2 = x2_rel * page_width + y2 = y2_rel * page_height + + # Ensure coordinates are within image bounds and are integers + x1 = max(0, int(x1)) + y1 = max(0, int(y1)) + x2 = min(page_width, int(x2)) + y2 = min(page_height, int(y2)) + + # Check if crop area is valid + if x2 <= x1 or y2 <= y1: + print(f"❌ [DEBUG] Invalid crop area: width={x2-x1}, height={y2-y1}") + return None + + # Crop the image + cropped = page_image.crop((x1, y1, x2, y2)) + return cropped + else: + print(f"❌ [DEBUG] Invalid bbox length: {len(bbox)}") + + return None + + def assemble_html(self, document, child_blocks, parent_structure): + # Use consistent placeholder ID format matching HTMLRenderer + imgid = str(self.id) + + # 如果是mock数据,输出固定内容 + if self.table_data.get('mock', False): + return f"

<p>_placeholder_tableid_{imgid}</p>
" + + # 如果有自定义html + if self.html: + # return f"{self.html}" + return f"

<p>_placeholder_tableid_{imgid}</p>
" + + # 如果有表格数据中的内容 + if self.table_data.get('content'): + # return f"{self.table_data['content']}
" + return f"

<p>_placeholder_tableid_{imgid}</p>
" + + # Use placeholder for default case + return f"

<p>_placeholder_tableid_{imgid}</p>
" \ No newline at end of file diff --git a/marker/schema/blocks/picture.py b/marker/schema/blocks/picture.py index 6f815516..ba2f9a9c 100644 --- a/marker/schema/blocks/picture.py +++ b/marker/schema/blocks/picture.py @@ -11,6 +11,15 @@ def assemble_html(self, document, child_blocks, parent_structure): child_ref_blocks = [block for block in child_blocks if block.id.block_type == BlockTypes.Reference] html = super().assemble_html(document, child_ref_blocks, parent_structure) + # Use consistent placeholder ID format matching HTMLRenderer + imgid = str(self.id) + print("@@@@####picture imgid: ", imgid) + if self.description: - return html + f"

<p>Image {self.id} description: {self.description}</p>
" - return html + # Include both placeholder and description + placeholder = f"

<p>_placeholder_imgid_{imgid}</p>
" + description = f"

<p>Image {self.id} description: {self.description}</p>
" + return html + placeholder + description + # else: + # Just placeholder + return f"

<p>_placeholder_imgid_{imgid}</p>
" diff --git a/marker/schema/document.py b/marker/schema/document.py index 1ff21060..a9d44632 100644 --- a/marker/schema/document.py +++ b/marker/schema/document.py @@ -23,7 +23,7 @@ class TocItem(BaseModel): class Document(BaseModel): - filepath: str + filepath: str | bytes pages: List[PageGroup] block_type: BlockTypes = BlockTypes.Document table_of_contents: List[TocItem] | None = None diff --git a/marker/schema/groups/page.py b/marker/schema/groups/page.py index 5a176073..8acfd71a 100644 --- a/marker/schema/groups/page.py +++ b/marker/schema/groups/page.py @@ -22,6 +22,7 @@ class PageGroup(Group): # This is bytes if it is serialized lowres_image: Image.Image | None | bytes = None highres_image: Image.Image | None | bytes = None + original_image: Image.Image | None | bytes = None children: List[Union[Any, Block]] | None = None layout_sliced: bool = ( False # Whether the layout model had to slice the image (order may be wrong) @@ -50,10 +51,17 @@ def get_image( self, *args, highres: bool = False, + original: bool = False, remove_blocks: Sequence[BlockTypes] | None = None, **kwargs, ): - image = self.highres_image if highres else self.lowres_image + if original: + image = self.original_image + # Fallback to highres if original is not available + if image is None: + image = self.highres_image + else: + image = self.highres_image if highres else self.lowres_image # Avoid double OCR for certain elements if remove_blocks: @@ -74,6 +82,8 @@ def get_image( @computed_field @property def current_children(self) -> List[Block]: + if self.children is None: + return [] return [child for child in self.children if not child.removed] def get_next_block( diff --git a/marker/schema/registry.py b/marker/schema/registry.py index d667fb2d..0cec54e5 100644 --- a/marker/schema/registry.py +++ b/marker/schema/registry.py @@ -7,6 +7,7 @@ ListItem, PageFooter, PageHeader, Picture, \ SectionHeader, Table, TableOfContents, \ Text, ComplexRegion, TableCell, Reference +from marker.schema.blocks.molecule import Molecule, MoleculeTable from marker.schema.document import Document from marker.schema.groups import FigureGroup, ListGroup, PageGroup, \ PictureGroup, TableGroup @@ -30,6 +31,8 @@ def get_block_class(block_type: BlockTypes) -> Type[Block]: register_block_class(BlockTypes.Span, Span) register_block_class(BlockTypes.FigureGroup, FigureGroup) register_block_class(BlockTypes.TableGroup, TableGroup) +register_block_class(BlockTypes.Molecule, Molecule) +register_block_class(BlockTypes.MoleculeTable, MoleculeTable) register_block_class(BlockTypes.ListGroup, ListGroup) register_block_class(BlockTypes.PictureGroup, PictureGroup) register_block_class(BlockTypes.Page, PageGroup) diff --git a/marker/services/gemini.py b/marker/services/gemini.py index 990a6b4a..1c3ac973 100644 --- a/marker/services/gemini.py +++ b/marker/services/gemini.py @@ -9,6 +9,7 @@ from google.genai.errors import APIError from pydantic import BaseModel +import httpx from marker.schema.blocks import Block from marker.services import BaseService @@ -43,7 +44,7 @@ def __call__( if not isinstance(image, list): image = [image] - + print('ssssss', flush=True) client = self.get_google_client(timeout=timeout) image_parts = [types.Part.from_bytes(data=self.img_to_bytes(img), mime_type="image/webp") for img in image] @@ -84,10 +85,14 @@ class GoogleGeminiService(BaseGeminiService): gemini_api_key: Annotated[ str, "The Google API key to use for the service." 
- ] = None + ] = 'AIzaSyCguwG1QrgHvrbJwT5g5S7IIs3yZTUOaEg' + + # import os + # os.environ["HTTP_PROXY"] = "http://172.21.0.16:7890" + # os.environ["HTTPS_PROXY"] = "http://172.21.0.16:7890" def get_google_client(self, timeout: int): return genai.Client( api_key=self.gemini_api_key, - http_options={"timeout": timeout * 1000} # Convert to milliseconds + http_options={"timeout": timeout * 1000}, # Convert to milliseconds ) diff --git a/marker/utils.py b/marker/utils.py new file mode 100644 index 00000000..95fdabf4 --- /dev/null +++ b/marker/utils.py @@ -0,0 +1,25 @@ +import torch +from marker.settings import settings +import requests +import traceback +from typing import Any +import threading + + +def flush_cuda_memory(): + if settings.TORCH_DEVICE_MODEL == "cuda": + torch.cuda.empty_cache() + + +def send_callback(callback_url: str, result: Any): + threading.Thread(target=send_callback_inner, args=(callback_url, result)).start() + + +def send_callback_inner(url: str, result: Any): + try: + print('callback url: ', url, flush=True) + response = requests.post(url, json=result) + print(f"Callback response status: {response.text}", flush=True) + except Exception as e: + traceback.print_exc() + print(f"Callback failed: {e}", flush=True) diff --git a/marker_main.py b/marker_main.py new file mode 100644 index 00000000..961b8790 --- /dev/null +++ b/marker_main.py @@ -0,0 +1,290 @@ +import time +import torch +import os +from datetime import datetime +from marker.utils import send_callback, flush_cuda_memory +import pytz +# 获取北京时区 +beijing_tz = pytz.timezone('Asia/Shanghai') +os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # For some reason, transformers decided to use .isin for a simple op, which is not supported on MPS + +# from marker.convert import convert_single_pdf +from marker.config.parser import ConfigParser +from marker.converters.pdf import PdfConverter +from marker.settings import settings +from marker.logger import configure_logging +from marker.models import create_model_dict + +configure_logging() + + +class ExtractionProc: + def __init__(self, config=None): + self.model_lst = [] + + def load_models(self): + # self.model_lst = load_all_models(torch.device("cuda"), dtype=torch.float16) + self.model_dict = create_model_dict() + + def parse_docx(self, file_byte): + # 使用python-docx库解析docx文件 + from io import BytesIO + from docx import Document + + # 创建BytesIO对象 + docx_file = BytesIO(file_byte) + + # 加载docx文件 + doc = Document(docx_file) + + # 提取所有段落中的文本 + full_text = [] + for para in doc.paragraphs: + if para.text.strip(): + full_text.append(para.text) + + # 提取表格中的文本 + for table in doc.tables: + for row in table.rows: + for cell in row.cells: + if cell.text.strip(): + full_text.append(cell.text) + + # 将所有文本拼接成一个字符串 + text = "\n".join(full_text) + return { + 'text': text, + 'images': {}, + 'info': {'table_count': 0, 'formula_count': 0, 'ocr_count': 0}, + 'metadata': {}, + 'mol_images': {}, + 'table_contents': {} + } + + def parse_pptx(self, file_byte): + # 使用python-pptx库解析pptx文件 + from io import BytesIO + from pptx import Presentation + + # 创建BytesIO对象 + pptx_file = BytesIO(file_byte) + + # 加载pptx文件 + prs = Presentation(pptx_file) + + # 提取所有幻灯片中的文本 + full_text = [] + for slide in prs.slides: + for shape in slide.shapes: + if hasattr(shape, "text"): + full_text.append(shape.text) + + # 将所有文本拼接成一个字符串 + text = "\n".join(full_text) + return { + 'text': text, + 'images': {}, + 'info': {'table_count': 0, 'formula_count': 0, 'ocr_count': 0}, + 'metadata': {}, + 'mol_images': {}, + 'table_contents': {} 
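# The lightweight parse_docx / parse_pptx (and parse_txt below) helpers all return this same
# result schema, so callers can treat Office formats uniformly. A minimal usage sketch, assuming
# a hypothetical slides.pptx on disk (illustration only):
#
#   proc = ExtractionProc()
#   with open("slides.pptx", "rb") as f:
#       result = proc.parse_pptx(f.read())
#   print(result['text'])                                   # plain text joined from all slides
#   print(result['mol_images'], result['table_contents'])   # empty placeholders in these fallbacks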
+ } + + def parse_txt(self, file_byte): + # 这个不需解析,返回文本 + full_text = file_byte.decode('utf-8') + return { + 'text': full_text, + 'images': {}, + 'info': {'table_count': 0, 'formula_count': 0, 'ocr_count': 0}, + 'metadata': {}, + 'mol_images': {}, + 'table_contents': {} + } + + def parse_docx_direct(self, file_byte): + """Parse DOCX directly to markdown, bypassing PDF conversion for better accuracy""" + from io import BytesIO + from markdownify import markdownify as md + import mammoth + import re + import base64 + + def convert_image_to_base64(image): + """Convert mammoth image to base64 data URI""" + try: + with image.open() as image_bytes: + return "data:" + image.content_type + ";base64," + base64.b64encode(image_bytes.read()).decode() + except Exception as e: + print(f"Failed to convert image: {e}") + return "" + + # 创建BytesIO对象 + docx_file = BytesIO(file_byte) + + # Configure style mapping to preserve heading levels + style_map = """ + p[style-name='Heading 1'] => h1:fresh + p[style-name='Heading 2'] => h2:fresh + p[style-name='Heading 3'] => h3:fresh + p[style-name='Heading 4'] => h4:fresh + p[style-name='Heading 5'] => h5:fresh + p[style-name='Heading 6'] => h6:fresh + p[style-name='标题 1'] => h1:fresh + p[style-name='标题 2'] => h2:fresh + p[style-name='标题 3'] => h3:fresh + p[style-name='标题 4'] => h4:fresh + p[style-name='标题 5'] => h5:fresh + p[style-name='标题 6'] => h6:fresh + """ + + # Configure mammoth options for better conversion + convert_options = { + "convert_image": mammoth.images.img_element(lambda image: { + "src": convert_image_to_base64(image) + }), + "ignore_empty_paragraphs": False, + "style_map": style_map + } + + # Convert DOCX to HTML + result = mammoth.convert_to_html(docx_file, **convert_options) + html = result.value + + # Print conversion messages if any + if result.messages: + print(f"Mammoth conversion messages: {result.messages}", flush=True) + + print(f"Generated HTML length: {len(html)} characters", flush=True) + if html: + preview = html[:500].replace('\n', ' ') + print(f"HTML preview: {preview}...", flush=True) + + # Convert HTML directly to markdown + markdown_text = md( + html, + heading_style="ATX", # Use # ## ### style + wrap=True, + wrap_width=80, + convert=['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'strong', 'em', 'ul', 'ol', 'li', 'table', 'tr', 'td', 'th'] + ) + + print(f"Generated markdown length: {len(markdown_text)} characters", flush=True) + print(f"Markdown preview: {markdown_text[:500]}...", flush=True) + + # Extract table of contents from HTML + def extract_toc_from_html(html): + toc = [] + heading_pattern = r']*>([^<]+)' + matches = re.findall(heading_pattern, html) + + for level, title in matches: + toc.append({ + "title": title.strip(), + "heading_level": int(level), + "page_id": 0, + "polygon": [[0, 0], [100, 0], [100, 20], [0, 20]] # Dummy coordinates + }) + + return toc + + return { + 'text': markdown_text, + 'images': {}, # TODO: Extract and process images if needed + 'info': {'table_count': html.count('')], ['SectionHeader', sum(html.count(f'>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + ] + ] + }, + { + "title": "article info", + "heading_level": null, + "page_id": 0, + "polygon": [ + [ +<<<<<<< HEAD + 41.39697265625, + 324.8083190917969 + ], + [ + 139.61474609375, + 324.8083190917969 + ], + [ + 139.61474609375, + 334.193359375 + ], + [ + 41.39697265625, + 334.193359375 +======= + 41.33938321471214, + 324.8083190917969 + ], + [ + 139.41749703884125, + 324.8083190917969 + ], + [ + 139.41749703884125, + 334.13311767578125 + ], + [ + 
41.33938321471214, + 334.13311767578125 +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + ] + ] + }, + { + "title": "abstract", + "heading_level": null, + "page_id": 0, + "polygon": [ + [ + 206.2373239994049, + 324.8083190917969 + ], + [ + 275.1947765350342, + 324.8083190917969 + ], + [ + 275.1947765350342, + 334.3464592695236 + ], + [ + 206.2373239994049, + 334.3464592695236 + ] + ] + }, + { + "title": "1. Introduction", + "heading_level": null, + "page_id": 0, + "polygon": [ + [ + 42.04456412792206, + 509.4381408691406 + ], + [ + 105.00094729661942, + 508.91597557067865 + ], + [ + 105.00094729661942, + 517.7269287109375 + ], + [ + 42.04456412792206, + 518.6245439052582 + ] + ] + }, + { + "title": "2. Results and discussion", + "heading_level": null, + "page_id": 2, + "polygon": [ + [ + 41.75015592575073, + 129.14129638671875 + ], + [ + 144.22161865234375, + 129.14129638671875 + ], + [ + 144.22161865234375, + 137.43011474609375 + ], + [ + 41.75015592575073, + 137.43011474609375 + ] + ] + }, + { + "title": "2.1. Chemistry", + "heading_level": null, + "page_id": 2, + "polygon": [ + [ + 41.641335904598236, + 150.0606689453125 + ], + [ + 95.70146942138672, + 150.0606689453125 + ], + [ + 95.70146942138672, + 158.76501655578613 + ], + [ + 41.641335904598236, + 158.76501655578613 + ] + ] + }, + { + "title": "2.2. The inhibitory activity of synthetic compounds on FGFR1 kinase", + "heading_level": null, + "page_id": 2, + "polygon": [ + [ +<<<<<<< HEAD + 41.83349609375, + 495.08691406249994 +======= + 41.87101870775223, + 494.9530285596847 +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + ], + [ + 287.6133117675781, + 494.9530285596847 + ], + [ + 287.6133117675781, + 503.55340576171875 + ], + [ +<<<<<<< HEAD + 41.83349609375, +======= + 41.87101870775223, +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + 503.55340576171875 + ] + ] + }, + { + "title": "2.3. Quantitative evaluation of structure-activity relationship\n(QSAR)", + "heading_level": null, + "page_id": 2, + "polygon": [ + [ + 311.0528962612152, + 285.9808774590492 + ], + [ + 537.757762670517, + 285.9808774590492 + ], + [ + 537.757762670517, + 304.7886962890625 + ], + [ + 311.0528962612152, + 304.7886962890625 + ] + ] + }, + { + "title": "2.4. D12 and D15 inhibit FGFR1 through an ATP-independent\nmechanism", + "heading_level": null, + "page_id": 2, + "polygon": [ + [ + 311.52783203125, + 545.7848968505859 + ], + [ + 539.9238965511322, + 545.7848968505859 + ], + [ + 539.9238965511322, + 566.3106231689453 + ], + [ + 311.52783203125, + 566.3106231689453 + ] + ] + }, + { + "title": "2.5. Molecular docking and molecular dynamics simulation", + "heading_level": null, + "page_id": 4, + "polygon": [ + [ +<<<<<<< HEAD + 41.54248046875, + 447.40039062499994 + ], + [ + 263.078125, + 447.40039062499994 +======= + 42.330730676651, + 448.152587890625 + ], + [ + 263.7506012916565, + 446.6209299564361 +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + ], + [ + 262.5865387916565, + 456.4413757324219 + ], + [ +<<<<<<< HEAD + 41.54248046875, + 456.4413757324219 +======= + 41.166668176651, + 457.12694287300104 +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + ] + ] + }, + { + "title": "2.6. 
D12 and D15 suppress the growth of GC cell lines", + "heading_level": null, + "page_id": 6, + "polygon": [ + [ +<<<<<<< HEAD + 42.12451171875, +======= + 42.51889419555664, +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + 639.9525909423828 + ], + [ + 244.26221704483032, + 639.9525909423828 + ], + [ + 243.09815454483032, + 649.9903259277344 + ], + [ +<<<<<<< HEAD + 42.12451171875, + 650.1650390625 +======= + 41.44331419467926, + 651.2933403253555 +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + ] + ] + }, + { + "title": "2.7. D12 and D15 inhibit the phosphorylation of FGFR1 and the\ndownstream signaling", + "heading_level": null, + "page_id": 6, + "polygon": [ + [ + 311.52764892578125, + 650.4423370361328 + ], + [ +<<<<<<< HEAD + 547.69140625, + 650.4423370361328 + ], + [ + 547.69140625, + 671.1005859375 +======= + 547.6704823970795, + 650.4423370361328 + ], + [ + 547.6704823970795, + 670.9114608764648 +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + ], + [ + 311.52764892578125, + 671.9939604997635 + ] + ] + }, + { + "title": "2.8. D12 and D15 arrest the cell cycle at G0/G1 phase", + "heading_level": null, + "page_id": 7, + "polygon": [ + [ +<<<<<<< HEAD + 31.5751953125, +======= + 31.600541949272156, +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + 365.3900146484375 + ], + [ + 232.09129333496094, + 365.3900146484375 + ], + [ + 232.09129333496094, + 375.427734375 + ], + [ +<<<<<<< HEAD + 31.5751953125, +======= + 31.600541949272156, +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + 375.427734375 + ] + ] + }, + { + "title": "2.9. D12 and D15 induce apoptosis of GC cell lines", + "heading_level": null, + "page_id": 7, + "polygon": [ + [ + 31.930536687374115, + 555.1395874023438 + ], + [ +<<<<<<< HEAD + 220.298828125, + 555.1395874023438 + ], + [ + 220.298828125, +======= + 220.27825951576233, + 555.1395874023438 + ], + [ + 220.27825951576233, +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + 565.1773223876953 + ], + [ + 31.930536687374115, + 565.760049700737 + ] + ] + }, + { + "title": "3. Conclusion", + "heading_level": null, + "page_id": 7, + "polygon": [ + [ +<<<<<<< HEAD + 301.783203125, + 524.5517578125 + ], + [ + 357.658203125, + 524.5517578125 + ], + [ + 357.658203125, + 533.7709045410156 + ], + [ + 301.783203125, + 533.7709045410156 +======= + 301.83282470703125, + 525.4821166992188 + ], + [ + 357.81181049346924, + 524.0251131057739 + ], + [ + 357.81181049346924, + 533.7709045410156 + ], + [ + 301.83282470703125, + 534.1705958843231 +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + ] + ] + }, + { + "title": "4. Experimental section", + "heading_level": null, + "page_id": 8, + "polygon": [ + [ + 311.0147430896759, + 638.7971707582474 + ], + [ + 408.8246967792511, + 637.2463895082474 + ], + [ + 408.8246967792511, + 647.6088256835938 + ], + [ + 311.0147430896759, + 647.6595486402512 + ] + ] + }, + { + "title": "4.1. Chemistry", + "heading_level": null, + "page_id": 8, + "polygon": [ + [ + 311.52862548828125, + 660.2394104003906 + ], + [ + 364.71038818359375, + 659.1531623601913 + ], + [ + 364.71038818359375, + 668.5282135009766 + ], + [ + 311.52862548828125, + 669.3121803998947 + ] + ] + }, + { + "title": "4.1.1. 
General procedure for preparation of the intermediates 3a-3d", + "heading_level": null, + "page_id": 10, + "polygon": [ + [ + 42.27036589384079, + 127.3927001953125 + ], + [ + 287.6087646484375, + 127.3927001953125 + ], + [ + 287.6087646484375, +<<<<<<< HEAD + 137.728759765625 + ], + [ + 42.197265625, + 137.728759765625 +======= + 137.6444720029831 + ], + [ + 42.27036589384079, + 137.6444720029831 +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + ] + ] + }, + { + "title": "4.1.2. General procedure for synthesis of compounds A1-A7", + "heading_level": null, + "page_id": 10, + "polygon": [ + [ + 42.10457366704941, + 252.9097900390625 + ], + [ + 261.29352283477783, + 252.9097900390625 + ], + [ + 261.29352283477783, + 262.94757080078125 + ], + [ + 42.10457366704941, + 262.94757080078125 + ] + ] + }, + { + "title": "4.1.3. Synthesis of the intermediates 5e and 5b", + "heading_level": null, + "page_id": 10, + "polygon": [ + [ + 42.23431754112244, + 357.5101623535156 + ], + [ + 215.84536743164062, + 357.5101623535156 + ], + [ + 215.84536743164062, + 367.9389897584915 + ], + [ + 42.23431754112244, + 367.9389897584915 + ] + ] + }, + { + "title": "4.1.4. Synthesis of the intermediates 7e and 7b", + "heading_level": null, + "page_id": 10, + "polygon": [ + [ + 42.518890380859375, + 451.6757507324219 + ], + [ + 215.84432983398438, + 451.6757507324219 + ], + [ + 215.84432983398438, + 461.7134704589844 + ], + [ + 42.518890380859375, + 461.7134704589844 + ] + ] + }, + { + "title": "4.1.5. General procedure for synthesis of compounds 8e1-8e14 and\n8b1-8b5", + "heading_level": null, + "page_id": 10, + "polygon": [ + [ + 41.9001042842865, + 566.7053375244141 + ], + [ + 288.5704827308655, + 566.7053375244141 + ], + [ + 288.5704827308655, + 586.8070021867752 + ], + [ + 41.9001042842865, + 586.8070021867752 + ] + ] + }, + { + "title": "4.1.6. General procedure for synthesis of compounds B1eB14", + "heading_level": null, + "page_id": 10, + "polygon": [ + [ + 41.90244001150131, + 681.7930221557617 + ], + [ + 269.2685658931732, + 681.7930221557617 + ], + [ + 269.2685658931732, + 691.8307647705078 + ], + [ + 41.90244001150131, + 691.8331512212753 + ] + ] + }, + { + "title": "4.1.7. General procedure for synthesis of compounds C1eC5", + "heading_level": null, + "page_id": 10, + "polygon": [ + [ + 311.5270080566406, + 451.6757507324219 + ], + [ + 532.6221468448639, + 451.6757507324219 + ], + [ + 532.6221468448639, + 461.7134704589844 + ], + [ + 311.5270080566406, + 461.9815109968185 + ] + ] + }, + { + "title": "4.1.8. General procedure for synthesis of compounds D1-D13", + "heading_level": null, + "page_id": 11, + "polygon": [ + [ + 32.29959046840668, + 702.7132339477539 + ], + [ + 256.4778217077255, + 702.7132339477539 + ], + [ + 256.4778217077255, + 712.7509765625 + ], + [ + 32.29959046840668, + 712.9325883388519 + ] + ] + }, + { + "title": "4.1.9. Preparation of the protected vanillin 10", + "heading_level": null, + "page_id": 11, + "polygon": [ + [ + 301.83319091796875, + 221.50311279296875 + ], + [ + 469.5670166015625, + 221.3112453818321 + ], + [ + 469.5670166015625, + 231.5408935546875 + ], + [ + 301.83319091796875, + 231.5408935546875 + ] + ] + }, + { + "title": "4.1.10. 
Preparation of the unilateral substituted intermediate 11", + "heading_level": null, + "page_id": 11, + "polygon": [ + [ + 300.4928514957428, + 305.2381591796875 + ], + [ + 535.8410034179688, + 305.2381591796875 + ], + [ + 535.8410034179688, + 315.27587890625 + ], + [ + 300.4928514957428, + 315.78687131404877 + ] + ] + }, + { + "title": "4.1.11. General procedure for synthesis of compounds D14-D15", + "heading_level": null, + "page_id": 11, + "polygon": [ + [ + 301.8357238769531, + 399.3470764160156 + ], + [ + 533.2697205543518, + 399.3470764160156 + ], + [ + 533.2697205543518, + 409.3847961425781 + ], + [ + 301.8357238769531, + 409.4040729999542 + ] + ] + }, + { + "title": "4.2. In vitro kinase assay", + "heading_level": null, + "page_id": 12, + "polygon": [ + [ +<<<<<<< HEAD + 41.7607421875, + 160.1181640625 + ], + [ + 135.38156127929688, + 160.1181640625 +======= + 41.80940169095993, + 160.0301612019539 + ], + [ + 135.38156127929688, + 160.0301612019539 +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + ], + [ + 135.38156127929688, + 168.7818603515625 + ], + [ +<<<<<<< HEAD + 41.7607421875, +======= + 41.80940169095993, +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + 168.7818603515625 + ] + ] + }, + { + "title": "Inhibition % ¼ (max conversion)/(max min) 100%,", + "heading_level": null, + "page_id": 12, + "polygon": [ + [ +<<<<<<< HEAD + 41.83349609375, + 410.95703125 +======= + 41.948621690273285, + 411.1345514059067 +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + ], + [ + 258.52038407325745, + 411.1345514059067 + ], + [ + 258.52038407325745, + 422.720458984375 + ], + [ +<<<<<<< HEAD + 41.83349609375, +======= + 41.948621690273285, +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + 422.720458984375 + ] + ] + }, + { + "title": "4.3. Quantitative structure-activity relationships analysis", + "heading_level": null, + "page_id": 12, + "polygon": [ + [ +<<<<<<< HEAD + 41.97900390625, + 516.0224609375 +======= + 41.95530915260315, + 516.1073639392853 +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + ], + [ + 252.52962565422058, + 516.1073639392853 + ], + [ + 252.52962565422058, + 524.4728088378906 + ], + [ +<<<<<<< HEAD + 41.97900390625, +======= + 41.95530915260315, +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + 524.4728088378906 + ] + ] + }, + { + "title": "4.3.1. Materials and descriptors selection", + "heading_level": null, + "page_id": 12, + "polygon": [ + [ +<<<<<<< HEAD + 41.97900390625, + 536.1826171875 + ], + [ + 192.943359375, + 536.1826171875 + ], + [ + 192.943359375, + 545.3921813964844 + ], + [ + 41.97900390625, +======= + 42.082051217556, + 537.1033935546875 + ], + [ + 193.00542044639587, + 535.5950638055801 + ], + [ + 193.00542044639587, + 545.3921813964844 + ], + [ + 42.082051217556, +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + 545.3921813964844 + ] + ] + }, + { + "title": "4.3.2. Multiple linear regression (MLR) analysis", + "heading_level": null, + "page_id": 12, + "polygon": [ + [ +<<<<<<< HEAD + 41.97900390625, + 673.0390625 +======= + 42.52143096923828, + 673.1076889038086 +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + ], + [ + 216.97254943847656, + 672.0730897188187 + ], + [ + 216.97254943847656, + 681.3964920043945 + ], + [ +<<<<<<< HEAD + 41.97900390625, + 681.568359375 +======= + 41.48045492172241, + 682.249807715416 +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + ] + ] + }, + { + "title": "4.3.3. 
Validation of the models", + "heading_level": null, + "page_id": 12, + "polygon": [ + [ + 311.53033447265625, +<<<<<<< HEAD + 159.73046875 + ], + [ + 424.9178161621094, + 159.73046875 +======= + 160.36002403497696 + ], + [ + 424.9178161621094, + 158.80924278497696 +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + ], + [ + 424.9178161621094, + 168.77471923828125 + ], + [ + 311.53033447265625, + 168.77471923828125 + ] + ] + }, + { + "title": "4.4. ATP competitive inhibition assay", + "heading_level": null, + "page_id": 12, + "polygon": [ + [ + 311.4140725135803, + 265.09136962890625 + ], + [ + 448.2675094604492, + 264.23603081703186 + ], + [ + 447.985107421875, + 273.38018798828125 + ], + [ + 310.2500100135803, + 273.99372363090515 + ] + ] + }, + { + "title": "4.5. Molecular docking", + "heading_level": null, + "page_id": 12, + "polygon": [ + [ + 311.5272216796875, + 369.74688720703125 + ], + [ + 397.6161901950836, + 369.35276198387146 + ], + [ + 396.4521276950836, + 378.0356750488281 + ], + [ + 310.4133155345917, + 379.0003743171692 + ] + ] + }, + { + "title": "4.5.1. Molecular dynamics (MD) simulations", + "heading_level": null, + "page_id": 12, + "polygon": [ + [ + 311.5289306640625, + 610.29150390625 + ], + [ + 476.2380826473236, + 609.5897336006165 + ], + [ + 476.2380826473236, + 618.5802917480469 + ], + [ + 311.5289306640625, + 619.3566076755524 + ] + ] + }, + { + "title": "", + "heading_level": null, + "page_id": 13, + "polygon": [ + [ +<<<<<<< HEAD + 32.4482421875, + 110.78393554687499 + ], + [ + 158.1669921875, + 110.78393554687499 + ], + [ + 158.1669921875, + 119.31323242187499 + ], + [ + 32.4482421875, + 119.31323242187499 +======= + 32.35826340317726, + 108.22186279296875 + ], + [ + 157.82077169418335, + 108.22186279296875 + ], + [ + 157.82077169418335, + 119.36888790130614 + ], + [ + 32.35826340317726, + 119.36888790130614 +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + ] + ] + }, + { + "title": "4.5.2. Binding free energy calculations and decomposition analysis", + "heading_level": null, + "page_id": 13, + "polygon": [ + [ +<<<<<<< HEAD + 31.8662109375, +======= + 31.927366137504578, +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + 328.9812927246094 + ], + [ + 277.6880187988281, + 328.9812927246094 + ], + [ + 277.6880187988281, +<<<<<<< HEAD + 337.6826171875 + ], + [ + 31.8662109375, + 337.6826171875 +======= + 337.94953632354736 + ], + [ + 31.927366137504578, + 337.94953632354736 +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + ] + ] + }, + { + "title": "4.6. Biological activity experiments", + "heading_level": null, + "page_id": 13, + "polygon": [ + [ +<<<<<<< HEAD + 32.7392578125, + 566.810546875 + ], + [ + 162.08982849121094, + 566.810546875 +======= + 32.82513427734375, + 566.6887768507004 + ], + [ + 162.08982849121094, + 566.6887768507004 +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + ], + [ + 162.08982849121094, + 575.6666564941406 + ], + [ +<<<<<<< HEAD + 32.7392578125, +======= + 32.82513427734375, +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + 575.6666564941406 + ] + ] + }, + { + "title": "4.6.1. Cell culture", + "heading_level": null, + "page_id": 13, + "polygon": [ + [ + 32.725292295217514, + 588.2972412109375 + ], + [ + 97.28547668457031, + 586.9405564069748 + ], + [ + 97.28547668457031, + 596.5860290527344 + ], + [ + 32.725292295217514, + 596.7441555261612 + ] + ] + }, + { + "title": "4.6.2. 
MTT cytotoxicity assay", + "heading_level": null, + "page_id": 13, + "polygon": [ + [ + 32.82460021972656, + 683.2030024528503 + ], + [ +<<<<<<< HEAD + 140.26953125, + 682.34375 + ], + [ + 140.26953125, +======= + 140.9175756573677, + 681.6522212028503 + ], + [ + 139.7535131573677, +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + 691.8309936523438 + ], + [ + 32.02462297677994, + 691.8309936523438 + ] + ] + }, + { + "title": "sulfoxide (sigma) and quantified at 490 nm.", + "heading_level": null, + "page_id": 13, + "polygon": [ + [ + 301.8329162597656, + 64.63177490234375 + ], + [ + 468.15350341796875, + 64.63177490234375 + ], + [ + 468.15350341796875, + 74.6695556640625 + ], + [ + 301.8329162597656, + 75.737648203969 + ] + ] + }, + { + "title": "4.6.3. Western blot analysis", + "heading_level": null, + "page_id": 13, + "polygon": [ + [ +<<<<<<< HEAD + 301.4921875, + 86.940673828125 + ], + [ + 406.83984375, + 86.940673828125 +======= + 301.43293285369873, + 87.30010986328125 + ], + [ + 406.86378717422485, + 86.25338071584702 +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + ], + [ + 406.86378717422485, + 95.58892822265625 + ], + [ + 301.43293285369873, + 96.0571928024292 + ] + ] + }, + { + "title": "4.6.4. Hoechst 33342 staining", + "heading_level": null, + "page_id": 13, + "polygon": [ + [ + 301.8329162597656, +<<<<<<< HEAD + 296.00537109375 + ], + [ + 413.2421875, + 296.00537109375 +======= + 296.4940185546875 + ], + [ + 413.95061445236206, + 295.04763305187225 +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + ], + [ + 412.78655195236206, + 304.7828063964844 + ], + [ + 301.37268352508545, + 305.11569035053253 + ] + ] + }, + { + "title": "4.6.5. Cell cycle analysis", + "heading_level": null, + "page_id": 13, + "polygon": [ + [ + 301.83404541015625, + 390.0653556585312 + ], + [ + 391.6457214355469, + 390.0653556585312 + ], + [ + 391.6457214355469, + 398.9544677734375 + ], + [ + 301.15008783340454, + 398.9544677734375 + ] + ] + }, + { + "title": "4.6.6. Statistical analysis", + "heading_level": null, + "page_id": 13, + "polygon": [ + [ +<<<<<<< HEAD + 301.4921875, + 483.84374999999994 + ], + [ + 394.03515625, + 483.84374999999994 +======= + 301.8325500488281, + 484.06263315677637 + ], + [ + 394.0697569847107, + 484.06263315677637 +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + ], + [ + 394.0697569847107, + 493.0648193359375 + ], + [ + 301.8325500488281, + 493.0648193359375 + ] + ] + }, + { + "title": "Conflict of interest", + "heading_level": null, + "page_id": 13, + "polygon": [ + [ + 301.41495752334595, + 556.2740020751953 + ], + [ + 377.6795949935913, + 556.2740020751953 + ], + [ + 377.6795949935913, + 566.7362449169159 + ], + [ + 301.41495752334595, + 566.7362449169159 + ] + ] + }, + { + "title": "Acknowledgment", + "heading_level": null, + "page_id": 13, + "polygon": [ + [ + 301.53659296035767, + 599.7410327196121 + ], + [ + 371.7338104248047, + 599.7410327196121 + ], + [ + 371.7338104248047, + 608.1505126953125 + ], + [ + 301.53659296035767, + 608.1505126953125 + ] + ] + }, + { + "title": "Appendix A. 
Supplementary data", + "heading_level": null, + "page_id": 13, + "polygon": [ + [ + 301.7757430076599, + 683.5400619506836 + ], + [ + 436.25146293640137, + 682.106788277626 + ], + [ + 436.25146293640137, + 691.8288650512695 + ], + [ + 301.7757430076599, + 692.0412913560867 + ] + ] + }, + { + "title": "References", + "heading_level": null, + "page_id": 14, + "polygon": [ + [ + 42.08900511264801, + 66.38311767578125 + ], + [ + 85.58034354448318, + 65.74026058614254 + ], + [ + 85.58034354448318, + 74.67193603515625 + ], + [ + 42.08900511264801, + 74.97251550853252 + ] + ] + } + ], + "page_stats": [ + { + "page_id": 0, + "text_extraction_method": "pdftext", + "block_counts": [ + [ + "Span", + 301 + ], + [ + "Line", + 86 + ], + [ + "Text", + 17 + ], + [ + "SectionHeader", + 5 + ], + [ + "Molecule", + 4 +<<<<<<< HEAD + ], + [ + "Footnote", + 3 +======= +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + ], + [ + "Picture", + 3 + ], + [ + "Footnote", + 3 + ], + [ + "PageHeader", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0 + }, + "page_header": "European Journal of Medicinal Chemistry 127 (2017) 885e899\n", + "page_footer": "" + }, + { + "page_id": 1, + "text_extraction_method": "pdftext", + "block_counts": [ + [ + "Span", + 158 + ], + [ + "Line", + 50 + ], + [ + "Text", + 5 + ], + [ + "Molecule", + 4 + ], + [ + "PageHeader", + 2 + ], + [ + "Caption", + 2 + ], + [ + "Figure", + 1 + ], + [ + "FigureGroup", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0 + }, + "page_header": "", + "page_footer": "" + }, + { + "page_id": 2, + "text_extraction_method": "pdftext", + "block_counts": [ + [ + "Span", + 484 + ], + [ + "Line", + 124 + ], + [ + "Text", + 8 + ], + [ + "SectionHeader", + 5 + ], + [ + "Molecule", + 4 + ], + [ + "PageHeader", + 2 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0 + }, + "page_header": "S. Ying et al. / European Journal of Medicinal Chemistry 127 (2017) 885e899 887\n", + "page_footer": "" + }, + { + "page_id": 3, + "text_extraction_method": "pdftext", + "block_counts": [ + [ + "Span", + 34 + ], + [ + "Line", + 12 + ], + [ + "Text", + 3 + ], + [ + "PageHeader", + 2 + ], + [ + "Molecule", + 2 + ], + [ + "Figure", + 1 + ], + [ + "Reference", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0 + }, + "page_header": "888 S. Ying et al. / European Journal of Medicinal Chemistry 127 (2017) 885e899\n", + "page_footer": "" + }, + { + "page_id": 4, + "text_extraction_method": "pdftext", + "block_counts": [ + [ + "Span", + 413 + ], + [ + "Line", + 103 + ], + [ + "Text", + 5 + ], + [ + "Caption", + 3 + ], + [ + "PageHeader", + 2 + ], + [ + "Figure", + 2 + ], + [ + "Molecule", + 2 + ], + [ + "FigureGroup", + 2 + ], + [ + "SectionHeader", + 1 + ], + [ + "Reference", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0 + }, + "page_header": "S. Ying et al. 
/ European Journal of Medicinal Chemistry 127 (2017) 885e899 889\n", + "page_footer": "" + }, + { + "page_id": 5, + "text_extraction_method": "pdftext", + "block_counts": [ + [ + "Span", + 216 + ], + [ + "Line", + 209 + ], + [ + "Molecule", + 4 + ], + [ + "PageHeader", + 2 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "FigureGroup", + 1 + ], + [ + "Reference", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0 + }, + "page_header": "S. Ying et al. / European Journal of Medicinal Chemistry 127 (2017) 885\ne\n899 890\n", + "page_footer": "" + }, + { + "page_id": 6, + "text_extraction_method": "pdftext", + "block_counts": [ + [ + "Span", + 173 + ], + [ + "Line", + 35 + ], + [ + "Text", + 4 + ], + [ + "Molecule", + 4 + ], + [ + "PageHeader", + 2 + ], + [ + "SectionHeader", + 2 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "Reference", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0 + }, + "page_header": "S. Ying et al. / European Journal of Medicinal Chemistry 127 (2017) 885e899 891\n", + "page_footer": "" + }, + { + "page_id": 7, + "text_extraction_method": "pdftext", + "block_counts": [ + [ + "Span", + 322 + ], + [ + "Line", + 54 + ], + [ + "Text", + 5 + ], + [ + "SectionHeader", +<<<<<<< HEAD + 3 + ], + [ + "Molecule", +======= +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + 3 + ], + [ + "PageHeader", + 2 + ], + [ + "Molecule", + 2 + ], + [ + "Table", + 1 + ], + [ + "TextInlineMath", + 1 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "MoleculeTable", + 1 + ], + [ + "FigureGroup", + 1 + ], + [ + "Reference", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0 + }, + "page_header": "", + "page_footer": "" + }, + { + "page_id": 8, + "text_extraction_method": "pdftext", + "block_counts": [ + [ + "Span", + 111 + ], + [ + "Line", + 22 + ], + [ + "PageHeader", + 2 + ], + [ + "Text", + 2 + ], + [ + "SectionHeader", + 2 + ], + [ + "Molecule", + 2 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "FigureGroup", + 1 + ], + [ + "Reference", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0 + }, + "page_header": "S. Ying et al. / European Journal of Medicinal Chemistry 127 (2017) 885e899 893\n", + "page_footer": "" + }, + { + "page_id": 9, + "text_extraction_method": "pdftext", + "block_counts": [ + [ + "Span", + 115 + ], + [ + "Line", + 14 + ], + [ + "Molecule", + 4 + ], + [ + "PageHeader", + 2 + ], + [ + "Text", + 2 + ], + [ + "Figure", + 1 + ], + [ + "Caption", + 1 + ], + [ + "FigureGroup", + 1 + ], + [ + "Reference", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0 + }, + "page_header": "", + "page_footer": "" + }, + { + "page_id": 10, + "text_extraction_method": "pdftext", + "block_counts": [ + [ + "Span", + 828 + ], + [ + "Line", + 143 + ], + [ + "Text", + 8 + ], + [ + "SectionHeader", + 7 + ], + [ + "TextInlineMath", + 5 + ], + [ + "Molecule", + 3 + ], + [ + "PageHeader", + 2 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0 + }, + "page_header": "S. Ying et al. 
/ European Journal of Medicinal Chemistry 127 (2017) 885e899 895\n", + "page_footer": "" + }, + { + "page_id": 11, + "text_extraction_method": "pdftext", + "block_counts": [ + [ + "Span", + 1054 + ], + [ + "Line", + 156 + ], + [ + "TextInlineMath", + 8 + ], + [ + "SectionHeader", + 4 + ], + [ + "Text", + 3 + ], + [ + "Molecule", + 3 + ], + [ + "PageHeader", + 2 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0 + }, + "page_header": "", + "page_footer": "" + }, + { + "page_id": 12, + "text_extraction_method": "pdftext", + "block_counts": [ + [ + "Span", + 430 + ], + [ + "Line", + 123 + ], + [ + "Text", + 10 + ], + [ + "SectionHeader", + 9 + ], + [ + "Molecule", + 3 + ], + [ + "PageHeader", + 2 + ], + [ + "TextInlineMath", + 2 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0 + }, + "page_header": "S. Ying et al. / European Journal of Medicinal Chemistry 127 (2017) 885e899 897\n", + "page_footer": "" + }, + { + "page_id": 13, + "text_extraction_method": "pdftext", + "block_counts": [ + [ + "Span", + 404 + ], + [ + "Line", + 116 + ], + [ + "SectionHeader", + 13 + ], + [ + "Text", + 10 + ], + [ + "Equation", + 4 + ], + [ + "TextInlineMath", +<<<<<<< HEAD +======= + 3 + ], + [ + "Molecule", +>>>>>>> 7b3253e3dcdde315039ef0dc00a327f6a7ad50b5 + 3 + ], + [ + "PageHeader", + 2 + ], + [ + "Molecule", + 2 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0 + }, + "page_header": "898 S. Ying et al. / European Journal of Medicinal Chemistry 127 (2017) 885e899\n", + "page_footer": "" + }, + { + "page_id": 14, + "text_extraction_method": "pdftext", + "block_counts": [ + [ + "Span", + 524 + ], + [ + "Line", + 141 + ], + [ + "ListItem", + 46 + ], + [ + "Molecule", + 3 + ], + [ + "PageHeader", + 2 + ], + [ + "ListGroup", + 2 + ], + [ + "SectionHeader", + 1 + ], + [ + "Text", + 1 + ], + [ + "Reference", + 1 + ] + ], + "block_metadata": { + "llm_request_count": 0, + "llm_error_count": 0, + "llm_tokens_used": 0 + }, + "page_header": "S. Ying et al. 
/ European Journal of Medicinal Chemistry 127 (2017) 885e899 899\n", + "page_footer": "" + } + ] +} \ No newline at end of file diff --git a/molecule_test_output/chemistry_test_with_molecules.md b/molecule_test_output/chemistry_test_with_molecules.md new file mode 100644 index 00000000..68c3b496 --- /dev/null +++ b/molecule_test_output/chemistry_test_with_molecules.md @@ -0,0 +1,486 @@ +![](_page_0_Picture_1.jpeg) + +Contents lists available at [ScienceDirect](www.sciencedirect.com/science/journal/02235234) + +# European Journal of Medicinal Chemistry + +Research paper + +# Synthesis, biological evaluation, QSAR and molecular dynamics simulation studies of potential fibroblast growth factor receptor 1 inhibitors for the treatment of gastric cancer + +![](_page_0_Picture_7.jpeg) + +Shilong Ying a, 1 , Xiaojing Du a, b, 1 , Weitao Fu a, 1 , Di Yun a , Liping Chen a , Yuepiao Cai a , Qing Xu c , Jianzhang Wu a, \* , Wulan Li a, d, \*\*, Guang Liang a + +a Chemical Biology Research Center, School of Pharmaceutical Sciences, Wenzhou Medical Universtiy, Wenzhou, Zhejiang 325035, China + +b Department of Digestive Diseases, The First Affiliated Hospital of Wenzhou Medical University, Wenzhou, Zhejiang 323000, China + +c College of Chemistry and Materials Engineering, Wenzhou University, Wenzhou, Zhejiang 325035, China + +d College of Information Science and Computer Engineering, Wenzhou Medical University, Wenzhou, Zhejiang 325035, China + +# article info + +Article history: Received 27 August 2016 Received in revised form 12 October 2016 Accepted 31 October 2016 Available online 1 November 2016 + +Keywords: Design Fibroblast growth factor receptor 1 Gastric cancer Quantitative structure-activity relationship + +# abstract + +Accumulating evidence suggests that fibroblast growth factor receptor 1 (FGFR1) is an attractive target in gastric cancer therapy. Based on our previous discovery of two non-ATP competitive FGFR1 inhibitors, A114 and A117, we designed and screened a series of compounds with the framework of bisaryl-1,4-dien-3-one. Among them, D12 and D15 exhibited the most potent FGFR1 inhibitory activity, which was ATPindependent. Furthermore, a quantitative structure-activity relationship analysis of 41 analogs demonstrated that the specific structural substitutions alter their bioactivities. Molecular docking and dynamics simulation analysis indicated the hydrophobic interaction at the FGFR1-D12/D15 interaction was dominant. Evaluation for anti-gastric cancer efficacy of D12 and D15 indicated effective inhibition of cell proliferation, apoptosis induction and cell cycle arrest. Thus, these two FGFR1 inhibitors have therapeutic potential in the treatment of gastric cancer, and this study provides will contribute to the rational design of novel non-ATP competitive FGFR1 inhibitors. + +© 2016 Elsevier Masson SAS. All rights reserved. + +#### 1. Introduction + +Gastric cancer (GC) is the fifth most common malignancy and the third leading cause of cancer-related death worldwide [\[1,2\].](#page-14-0) Although surgery procedures and chemotherapy have improved greatly in recent years, the clinical outcomes with treatment of gastric cancer remain poor due to absence of early detection of the cancer [\[3\].](#page-14-0) For advanced gastric cancer patients, a combination of surgery and traditional chemotherapy is still the mainstream remedy [\[4,5\].](#page-14-0) However, the severe and irreversible side-effects of chemotherapy limit their therapeutic use [\[6,7\]](#page-14-0). 
Thus, development of newly targeted therapy for gastric cancer with minimal sideeffects is highly desirable. At present, the only small molecule + +1 These authors contribute to this work equally. + + 0223-5234/© 2016 Elsevier Masson SAS. All rights reserved. inhibitor for GC therapy available on the market is Apatnib, an inhibitor of vascular endothelial growth factor receptor (VEGFR) which was approved by the China Food and Drug Administration (CFDA) in 2014 [\[8,9\]](#page-14-0). In recent years, an increasing number of signaling molecules are considered as novel targets against cancer [\[10\]](#page-14-0). Of these, Fibroblast growth factor receptor 1 (FGFR1) is considered to be an important potential target for treating GC [\[3,11,12\].](#page-14-0) + +FGFR1, belonging to the superfamily of receptor tyrosine kinases (RTK), has been intensively studied as a drug target for cancer [\[13,14\].](#page-14-0) The binding of fibroblast growth factor (FGF) and FGFR triggers the dimerization of the extracellular receptor domains, trans-phosphorylation of the intracellular kinase domain, and the subsequent activation of downstream signaling pathways, the mitogen activated protein kinase (MAPK) and phosphoinositide-3 kinase (PI3K)/Akt [\[15,16\]](#page-14-0). Increased activity of FGFR1 is closely linked with malignant tumors, such as lung cancer [\[17\],](#page-14-0) gastric adenocarcinoma [\[12\],](#page-14-0) renal cell carcinoma [\[18\]](#page-14-0), and breast cancer [\[19\]](#page-14-0). Aberrant FGFR1 activity as a result of mutations, gene amplifications, or chromosomal translocations are reported as a key + +c1ccccc1 + +c1ccccc1 + +c1ccccc1 + +c1ccccc1 + +\* Corresponding author. + +\*\* Corresponding author. Chemical Biology Research Center, School of Pharmaceutical Sciences, Wenzhou Medical Universtiy, Wenzhou, Zhejiang 325035, China. + +E-mail addresses: [wjzwzmu@163.com](mailto:wjzwzmu@163.com) (J. Wu), [lwlwzmu@163.com](mailto:lwlwzmu@163.com) (W. Li). + +driver of tumor progression by promoting proliferation, angiogenesis, survival, migration and invasion of cancer cells [\[20,21\].](#page-14-0) Recently, Oki et al. found that FGFR1 expression is upregulated in GC tissue specimens [\[22\],](#page-14-0) as are several GC cell lines (MKN-74, MKN-45, BGC-823, MGC-803, and SGC-7901) [\[23\].](#page-14-0) + +At present, several small molecule inhibitors of FGFR1 are being evaluated in pre-clinical or clinical trials as anti-gastric cancer agents [\[13,24,25\]](#page-14-0). In one study, NVP-BGJ398, a pan-FGFR inhibitor showed strong inhibition of proliferation of KKLS cells which have high FGFR1 expression, and built with low inhibition on TMK-1 cells expressing low levels of FGFR1 [\[25\].](#page-14-0) Another potent FGFR1 inhibitor, AZD4547, has been completed a phase 2 clinical trial assessing its efficacy and safety in advanced gastric or gastrooesophageal junction cancer patients (NCT01795768). However, the development of current FGFR inhibitors as anti-cancer drugs remains limited. Most of these inhibitors belong to ATPcompetitive inhibitors, which bind to and inhibit the relatively conserved ATP binding domain of RTKs. Such inhibition of FGFR often results in significant side-effects and toxicity in clinical and pre-clinical studies [\[26](#page-14-0)e[28\].](#page-14-0) Thus, the development of non-ATPcompetitive FGFR inhibitors can be a more selective approach to overcome the above difficulties. 
+ +Previously, we identified two bisaryl-1,4-dien-3-one-containing non-ATP competitive FGFR1 inhibitors, A114 and A117, and characterized their therapeutic potency in non-small cell lung cancer in vitro and in vivo studies [\[29\]](#page-14-0). To identify FGFR1 inhibitors with greater efficacy for gastric cancer, we analyze the structureactivity relationship of analogs of bisaryl-1,4-dien-3-one-containing FGFR1 inhibitors. We developed four series of analogs by a rational modified strategy based on the inhibitory activity screening results (Fig. 1). We also established a QSAR model to identify the pharmacophore. Firstly, we altered the cyclopentanone ring in the structure of A114 and A117 into various ketones (the A series compounds) to investigate whether the middle-linking chain was essential. Moreover, further detailed modifications were made on optimization of one of the phenyl substituents to develop the B series compounds. The latter was the unilateral 4-nitrogen atomcontaining moiety, which significantly enhanced the activity of the compound. To confirm the effect of this interesting active fragment, the C series compounds, characterized by a common para-position nitrogen atom-containing heterocyclic group, was produced. By removing the propionyl or isobutyryl fragment to produce an exposed hydroxyl, the D series compound was designed, which was expected to effectively elevate the activity by forming hydrogen bond donors and acceptor. + +In this study, we successfully screened and found two active FGFR1 inhibitors, D12 and D15, by kinase inhibition assay and confirmed their non-ATP competitive inhibitory characteristic. + +![](_page_1_Figure_7.jpeg) + +Fig. 1. Chemical structure of A114, A117 and design strategy of four series derivatives. + +c1ccccc1 + +c1ccccc1 + +c1ccccc1 + +c1ccccc1 + +Biological assessment indicated effective suppression of GC proliferation and induction of apoptosis. This finding constructs an appropriate QSAR model of non-ATP competitive FGFR1 inhibitors for the first time, and provides two potential targeted therapeutic agents against the GC. + +#### 2. Results and discussion + +#### 2.1. Chemistry + +The synthetic routes of the 41 derivatives of A114 and A117 were shown as [Scheme 1.](#page-3-0) The synthesis of A1-A7 started from corresponding ketones 1 (1a: tetrahydro-4H-pyran-4-one, 1b: cyclohexanone, 1c: acetone, 1d: 4-Piperidone hydrochlorides hydrate). The intermediates 3a-3g were obtained through a step of the classic Claisen-Schmidt condensation reaction, which applied HCl gas as catalyst, and then reacted with propionyl or isobutyryl chloride under Et3N catalyzing to prepare the compounds A1-A7. The procedures to prepare B1-B14, C1-C5 and D1-D13 were similar, in which cyclopentanone 1e or cyclohexanone 1b was activated with morpholine to generate the enamines 5e or 5b. The ethanol solution of 5e or 5b with vanillin was reacted under 78 C, adjusted the pH to acidic conditions to obtain the key intermediates 7e or 7b. Next, the solutions were condensed with different substitutional aromatic aldehyde for preparing the desired D1-D13, 8e1-8e14 and 8b1-8b5. Further, acylation of 8e1-8e14 and 8b1-8b5 with propionyl or isobutyruyl in the presence of Et3N produced the B1eB14 or C1eC5. The general process for synthesizing D14-D15 was by a three-step reaction. Protection of Vanillin with 3,4-dihydro-2Hpyran (THP) yielded the corresponding benzaldehyde 10. 
Through a unilateral aldol condensation between 10 with acetone in a catalyst of 20% NaOH achieved the conversion to unilateral unsaturated ketone 11. Finally, D14-D15 was obtained by Claisen-Schmidt condensation between 11 and corresponding benzaldehydes in the presence of 40% NaOH and the deprotection of THP in an acidic environment. All compounds were purified by column chromatography. The structures of A-D series compounds were shown in [Scheme 1,](#page-3-0) and confirmed by spectral analysis by high-resolution mass spectrometry (HRMS), nuclear magnetic resonance (1 H-NMR and 13C-NMR) spectroscopy. + +#### 2.2. The inhibitory activity of synthetic compounds on FGFR1 kinase + +The synthesized bisaryl-1,4-dien-3-one-containing compounds were initially screened for inhibition of FGFR kinase activity at a concentration of 20 mM by a Mobility Shift Assay. As shown as in the [Scheme 1,](#page-3-0) compound B14 exhibited the most potent inhibitory activity of FGFR1 kinase, and approximately half of the compounds (18/41) exhibited inhibitory ratio greater than 50%. These compounds were chosen for the further determination of half-maximal inhibitory concentration (IC50). + +Based on the inhibitory potency of all compounds (at 20 mM), a primary structure-activity relationship (SAR) was analyzed. For the A series compounds, the middle aliphatic ketone linker exerted a crucial influence on the inhibitory activity of compounds, changing the middle ketone into either pyrone or piperidone, abolished the potent action of A114 and A117. Based on our previous molecular docking study [\[29\]](#page-14-0), the middle connecting ketone group of A114 and A117 was likely sandwiched in a hydrophobic cleft of FGFR1 kinase domain. An aliphatic ketone linker can greatly stabilize the inactive conformation of kinase domain, as well as provide the selective inhibition on FGFR1. + +It's noteworthy that compound (B1), which adopted a cross position exchange between the propionyloxyl and methoxyl group, or a replacement of the propionyloxyl group with a methoxyl group (B2), resulted in a complete activity loss, which demonstrating that the inhibitory effect of A114 and A117 was closely related to the substituents on their benzene rings. Whether introducing electrondonating (methoxyl, ethyoxyl) or electron-withdrawing substituents (F atom) in one benzene ring of A114 and A117, either contributed little to activity, other than with the 4-nitrogen atomcontaining substituents. Significantly, results indicated that compounds bearing 4-pyrrolidyl or 4-morpholinyl (B7, B13 and B14) exhibited outstanding potency among the B series compounds, providing an attractive active fragment for the design of novel structural FGFR1 inhibitors based on the bisaryl-1,4-dien-3-one scaffold. The screening for the asymmetric derivatives (B, C, and D series) revealed that introducing the 4-nitrogen atom-containing substituents could be useful to enhance the inhibition efficacy. The C series compounds displayed a high level of inhibitory activity, with inhibitory ratio of IR % and a range of 84.3%e94.0% at a compound concentration of 20 mM. Moreover, compounds with the 4-nitrogen atom-containing group on the unilateral benzene ring or a 3-substituted indole moiety effectively suppressed FGFR1 kinase activity (D9-D15, IR %: 50.7%e87.1%, 20 mM). + +# 2.3. Quantitative evaluation of structure-activity relationship (QSAR) + +A QSAR model was constructed to further investigate the SAR of the derivatives of A114 and A117. 
Small molecules can be represented as molecular descriptors including constitutional descriptors, topological descriptors, auto-correlation descriptors, charge descriptors, molecular properties, which capture and magnifies distinct aspects of chemical structures. Scatter plot of the predicted activity versus experimental values was displayed in [Fig. 2](#page-4-0). Relative high regression coefficient was obtained from the ChemoPy descriptor calculation program and R program. It had a high adjusted squared regression coefficient (R2 adj ¼ 0.8196) and described more than 83.31% of variances in the experimental activity. Equation listed in the figure showed that MoRSEC27 (weighted by atomic charge) and bcute5 (weighted by atomic Sanderson electronegativities) created the major influence in the model. The inhibitory activities of molecules were negatively regulated by MoRSEC27, and positively regulated by bcute5, while S35 (Sum of E-State of atom type: dO) showed little relevance. Consequently, the electronegativity may exert a critical influence on the FGFR1 inhibitory activity of compounds. The combined SAR and QSAR results may provide useful information for the future development of novel FGFR1 kinase inhibitors. + +# 2.4. D12 and D15 inhibit FGFR1 through an ATP-independent mechanism + +In our early research, we identified the non-ATP competitive inhibitory mode of A114 and A117. Thus, the effect of ATP concentration on the potency of inhibition of FGFR1 activation by D12 and D15 was evaluated by a Caliper Mobility Shift Assay to determine whether the inhibitory mechanism was the same as with the parent compound. With increasing ATP concentration, the rate of phosphorylation of FGFR1 substrate gradually increased, reaching a plateau with ATP at 1048 mM. In the presence of D12 up to 20 mM, most of phosphorylation was abolished, and further increase in ATP concentration, up to 4192 mM, had no effect on the inhibitory potency of D12 [\(Fig. 3\)](#page-4-0), The finding supported a likely non-ATP competitive inhibition mechanism. We previously reported that the inhibition effect of PD173074, an ATP-competitive FGFR1 inhibitor, markedly reduced with increasing ATP concentration, indicating competition between ATP with the inhibitor [\[29\]](#page-14-0). Similarly, we evaluated D15 using the ATP-competitive assay and found + +c1ccccc1 + +c1ccccc1 + +c1ccccc1 + +c1ccccc1 + +![](_page_3_Figure_1.jpeg) + +**Reagents and conditions:** (a) HCl gas, EtOH, 40%–50%; (b) THF, propionyl, or isobutyryl chloride, Et3N, rt, 8%–31%; (c) p-TSA, cyclohexane, reflux, 50%; (d) EtOH, 90 °C, 30%; (e) 10% HCl solution, rt, 50%; (f1, f2) differently aldehydes, HCl gas, EtOH, 50–70 °C, 10%–70%; (g1, g2) THF, propionyl or isobutyryl chloride, Et3N, rt, 5%–54%; (h) PPTS, CH2Cl2, rt, 50%; (i) 20% NaOH solution, EtOH, rt, 50%; (j) substituted aldehydes, NaOH/dioxane suspension, 10%–70%; (k) 10% HCl solution, rt, 50%; + +**Abbreviation:** HCl, hydrochloric acid; EtOH, ethanol; THF, tetrahydrofuran; Et3N, triethylamine; rt, room temperature; p-TSA,para-toluenesulfonic acid; IR%, inhibitory ratio at a concentration of 20μM. + +Scheme 1. General synthetic route for the target compounds. + +c1ccccc1 + +c1ccccc1 + +![](_page_4_Figure_1.jpeg) + +Fig. 2. Plots of predicted activity against the corresponding experimental activity on FGFR1 inhibition. 
+ +Abbreviation: MoRSEC27, 3D-MoRSE - signal 27/weighted by atomic charge; S35, Sum of E-State of atom type: dO; bcute5, Highest eigenvaluen.5 of Burden matrix/weighted by atomic Sanderson electronegativities; N, the number of compounds taken into account in the regression; R2 , the multiple correlation coefficient; Radj 2 , adjusted multiple correlation coefficient; s, residual standard error; and the F value is related to the F-statistic analysis (Fischer test). The numbers in parentheses mean the standard deviation of the coefficients. p means the significance of the variables in the model. + +ATP-independent mechanism of inhibition. Based on these results, we conclude that the mechanism of action of D12 and D15 was in an ATP-independent, and consistent with their leads A114 and A117. + +#### 2.5. Molecular docking and molecular dynamics simulation + +We investigated interactions between the compounds and the kinase domain of FGFR1 by molecular docking analysis of the active compound D12/D15. We docked D12/D15 into the inactive conformation of FGFR1 kinase using our well-established molecular docking method which is based on the FGFR1-ARQ069 complex structure for reference (PDB Code: 3RHX) [\[28\].](#page-14-0) From an overall determination of root mean square deviations (RMSD) and estimated free energy, three poses of docking results were chosen as initial structures for molecular dynamics (MD) simulation. For D12, 93% poses with RMSD less than 2 Å, and the top ranking pose was chosen. For D15, there were two types of binding modes, one occupied 23% and others were 72%. Hence, two poses of D15 were chosen as initial structures for MD simulation. + +The RMSD of the receptor in FGFR1/D12 and FGFR1/D15 complexes was converged after 10e20 ns in 50 ns MD simulations, and the RMSD of backbone atoms was z 1e2 Å and z 4e5 Å, respectively [\(Fig. 4](#page-5-0)AeB). Binding energies for D12/D15 were calculated by MM-GBSA using 500 snapshots from 40 to 50 ns stable trajectories. Two poses of D15 were carried out for MD simulation. One showed 43.8067 ± 2.6494 kcal/mol binding energy, another was 26.9956 ± 3.6117 kcal/mol. Therefore, D12 and the significantly lower binding energy (high binding affinity) pose of D15 was chosen for further energy decomposition. In [Fig. 4](#page-5-0)C and D, the results showed that D12 and D15 shared seven identical key residues: PHE498, LEU630, ALA640, MET535, ASP641, VAL492 and ILE545. Interestingly, except ASP641, all these residues were hydrophobic amino acids, indicating that D12 and D15 likely interact primarily with non-polar residues in FGFR1 to exert their inhibitory activity. Kinases can exist as active and/or inactive states in cells. The hydrophobic residues in kinases interact to form "hydrophobic clusters" that stabilize the inactive conformation and interfere with ATP binding [\[26,33\]](#page-14-0). Targeting the auto-inhibited conformation (inactive conformation) of the kinase by the inhibitors represents an attractive approach due to the possibility of discovering more selective inhibitor [\[33\].](#page-14-0) For example, ARQ069, previously identified to be a non-ATP-competitive inhibitor of FGFR1/2, binds and inhibits the auto-inhibited conformation of FGFR1, and displays favorable kinase selectivity [\[28\]](#page-14-0). + +[Fig. 5A](#page-6-0) shows the whole molecule of D12 was sandwiched in a hydrophobic cleft between the residues PHE489, ALA640, LEU630, VAL492, VAL561, IEL545, and PHE642. 
The phenyl ring of indolyl in D12 created p-p stacking with the benzene side chain of PHE489 to stabilize the binding conformation, and another phenyl ring occupied the main hydrophobic pocket by making hydrophobic contacts with residues ILE545, MET535, PHE642, and ALA640. The results indicated that D12 formed one hydrogen bond with the side chain of PHE642. We compared modeling of the binding of D12 into FGFR1 with the FGFR1-ARQ069 co-crystal complex structure. D12 was observed to possess highly similar interactions with ARQ069, while ARQ069 only made two hydrogen bond interactions with the kinase domain, the major binding feature of ARQ069 was hydrophobic interactions, other than hydrogen bond interactions. The simulation results of D15 was almost identical to D12 ([Fig. 5B](#page-6-0)). The left phenyl ring of D15 formed p-p stacking with PHE489 as well. In addition, the right phenyl ring was wrapped in a hydrophobic pocket comprised of VAL561, ILE545, ALA640, and MET535. An extraordinary phenomenon was that no hydrogen bond existed between D15 with the kinase domain, suggesting that D15 + +![](_page_4_Figure_10.jpeg) + +Fig. 3. D12 and D15 inhibit FGFR1 through a mechanism that is independent of the concentration of ATP. Selective ATP-competitive kinase assay of compounds D12 and D15 with FGFR1 through Caliper Mobility Shift Assay. The conversion data were fitted with Graphpad for global fitting. + +c1ccccc1 + +c1ccccc1 + +![](_page_5_Figure_1.jpeg) + +Fig. 4. Backbone RMSDs are shown as a function of time for FGFR1/D12 and FGFR1/D15 complexes structures at 50 ns? A) Time evolution of the RMSD of FGFR1 and D12 are shown with green lines and blue lines, respectively. (B) Time evolution of the RMSD of FGFR1 and D15 are shown with green lines and blue lines, respectively. (C, D) Per-residue of top 10 contribution to the binding effective energy of D12/D15. Per-residue contributions were calculated by the MM-GBSA decomposition method. (For interpretation of the references to colour in this figure legend, the reader is referred to the web version of this article.) + +c1ccccc1 + +c1ccccc1 + +c1ccccc1 + +c1ccccc1 + +![](_page_6_Figure_1.jpeg) + +Fig. 5. Molecular dynamics analysis of D12/D15 to the activity cavity of FGFR1. (A) Last snapshot of FGFR1/D12 in 50ns MD simulations. (B) Last snapshot of FGFR1/D15 in 50ns MD simulations. + +interacted with FGFR1 kinase by only forming stable hydrophobic interactions. In summary, a critical hydrophobic interaction between D12 (and D15) with FGFR1 kinase domain was predicted by a rational method combined with molecular docking and molecular dynamics simulation. These methods clarified the interaction mechanisms of D12 and D15 with FGFR1. The further verification of co-crystallization of FGFR1-D12 and FGFR1-D15 is in progress. + +# 2.6. D12 and D15 suppress the growth of GC cell lines + +Aberrant activation of FGFR1 is correlated with tumor growth and development in a number of different cancers, including gastric cancer. The most potent compounds for evaluation of GC growth were selected based on their IC50 values. Most compounds showed favorable inhibitory activities using single concentration screening, with only four compounds showing relative low kinase IC50 (<10 mM) [\(Fig. 6\)](#page-7-0). The effect of the four FGFR1 kinase inhibitors (C5, D12, D14, and D15) on three GC cell lines was determined using the MTT assay. 
The GC cell lines were treated with each of the compounds, or with A114, A117, AZD4547 and Nordihydroguaiaretic acid (NDGA), a non-ATP competitive FGFR1 inhibitor, as the positive controls for 72 h. Among all tested compounds, D12 and D15 exhibited the greatest inhibitory activity of the MGC803 and BGC823 cell lines, with IC50 values of D12 and D15 lower than 10 mM in growth inhibition, and the anti-proliferative efficacies greater than their leads [\(Fig. 6](#page-7-0)). + +# 2.7. D12 and D15 inhibit the phosphorylation of FGFR1 and the downstream signaling + +FGF binding to FGFR1 induces the phosphorylation of FGFR1 and activates downstream pathways, including extracellular signalregulated kinase (ERK). ERK is a member of mitogen activated protein kinase (MAPK) and crucial for cellular proliferation and differentiation [\[34\]](#page-14-0). We determined the inhibitory effects of D12 and D15 on the phosphorylation of FGFR1 and ERK by Western blot + +c1ccccc1 + +c1ccccc1 + +| 化合物 | SMILES | 分子量 | +|-------|-------------|--------| +| 化合物-1 | C1CCCCC1 | 161.23 | +| 化合物-2 | C1CCC(CC1)O | 270.89 | + +Fig. 6. Active compounds (C5, D12, D14 and D15) effectively inhibited the FGFR1 kinase activity and the proliferation of three GC cells (SGC7901, MGC803, and BGC823). The IC50 values of the tested compounds on both enzyme and cell level were as shown. Four reported FGFR inhibitor A114, A117, AZD4547 and NDGA were used as a comparison in MTT assay. N.D. meant not determined. GC cells (3 103 cells/well; 96-well plates) was pretreated with compounds for 72 h, then the MTT assay gives the respective IC50 values of each compounds. The data were shown as a mean of 3e5 independent tests. + +analysis. Fig. 7 showed that D12 and D15 significantly suppressed the FGF2-induced phosphorylation of the threonine (T653/654) of FGFR1 and ERK activation in a dose-dependent manner. + +# 2.8. D12 and D15 arrest the cell cycle at G0/G1 phase + +We further examined whether D12 and D15 possessed cell cycle arrest potential. The SGC7901 cells were treated with different concentrations of D12 and D15 (5 mM, 10 mM, 20 mM) for 8 h, and cell cycle population distribution was measured by flow cytometry. Results indicated that D12 and D15 both induced cell cycle G0/G1 phase arrest in a concentration-dependent manner, and dramatically reduced the cell population of the G2/M phase ([Fig. 8A](#page-8-0)). The up-regulation of cyclin D1 can be an oncogene driving the cell-cycle regulation [\[35\],](#page-14-0) which has been reported to overexpressed in gastric cancer cells [\[36\]](#page-14-0). Therefore, we determined the expression level of cyclin D1 by Western blot analysis. Results indicated that both D12 and D15 down-regulated cyclin D1 expression in a dosedependent manner ([Fig. 8](#page-8-0)B). In summary, D12 and D15 induced cell cycle arrest of gastric cancer cells, which was attributed to at least in part by reducing cyclin D expression. + +# 2.9. D12 and D15 induce apoptosis of GC cell lines + +We determined whether the GC cell death induced by D12 and D15 was attributed to apoptosis through the mitochondriamediated apoptotic pathway. The morphological changes of cell nuclei was determined by Hoechst staining, and the expression of apoptosis-related molecules, i.e., cleaved-PARP and anti-apoptotic factor Bcl-2, was detected by Western blot analysis. Following 12 h treatment of GC cells with compounds, Hoechst staining showed chromatin condensation (strong blue fluorescence) and nucleus fragmentation ([Fig. 
9](#page-9-0)A). At increasing higher concentrations of the compounds concentration, chromatin condensation developed in a concentration-dependent manner. For Western blot analysis, SGC7901 cells were incubated with different concentrations of D12, D15 (5 mM, 10 mM, 20 mM) or 10 mM of A117. Results indicated that the expression of cleaved-PARP was increased, which was accompanied by decreased expression of Bcl-2, both in a dosedependent manner [\(Fig. 9B](#page-9-0)). Note that as low as 5 mM of D12 and D15 was sufficient to induce changes of the protein expression. Compared with A117, D12 and D15 achieved the similar effects at 10 mM [\(Fig. 9](#page-9-0)B). These results indicated that D12 and D15 were effective inducers of GC cells apoptosis. + +## 3. Conclusion + +In summary, with A114 and A117 as lead compounds, four series of derivatives were designed and synthesized. Among them, D12 and D15 were screened by kinase inhibition assay, and identified as non-ATP competitive inhibitors of the kinase domain of FGFR1. QSAR analysis confirmed that the inhibition potency was highly + +![](_page_7_Figure_12.jpeg) + +Fig. 7. Compounds D12 and D15 suppressed bFGF-induced phosphorylation of FGFR1 and the downstream ERK1/2 in a concentration-dependent manner. SGC7901 cells were incubated with compounds D12 (5, 10, 20 mM), D15 (5, 10, 20 mM), 10 mM A117 or DMSO respectively, and then stimulated with bFGF; cell lysates was collected and the phosphorylation levels of FGFR1 and ERK1/2 were determined by the western blot assay. + +c1ccccc1 + +c1ccccc1 + +c1ccccc1 + +![](_page_8_Figure_1.jpeg) + +Fig. 8. Compound D12 and D15 induced cell cycle arrest at the G0/G1 phase in SGC7901 cells. (A) Cell cycle analysis. Cells was treated with various concentrations of D12 and D15 for 8 h, then subjected to flow cytometric analysis for the distributions at each phase of cells. (B) Western blot analysis of cell cycle protein Cyclin D1. Data are presented as the mean ± SD of three independent experiments conducted in triplicate. \*P < 0.05; \*\*P < 0.01; \*\*\*P < 0.001. + +correlated with the skeleton structure and special substituents in the synthetic compounds. Molecular docking and molecular dynamics simulations analyses demonstrated that the binding mode of D12 and D15 with the kinase domain of FGFR1 was most likely hydrophobic interactions. Moreover, biological assays of antigastric cancer efficacy of D12 and D15 indicated effective inhibition of cell proliferation, apoptosis induction and cell cycle arrest. Based on these findings, we conclude that D12 and D15 are potential agents for treatment of gastric cancer. This work provided a reference for the discovery of novel FGFR1 inhibitors. + +# 4. Experimental section + +#### 4.1. Chemistry + +Reagents and solvents for the synthesis were commercially available and obtained from Sigma-Aldrich (St Louis, Missouri, USA) and Aladdin (Shanghai, China), which were used without further purification. Silica gel (GF254) for column chromatography (200e300 mesh) was obtained from Aladdin. Melting points were measured on a Fisher-Johns melting apparatus and were + +c1ccccc1 + +c1ccccc1 + +![](_page_9_Figure_2.jpeg) + +Fig. 9. (A) Morphological changes and Hoechst staining were observed in SGC7901 cells cultured with D12, D15 (5, 10, and 20 mM) for 12 h (200). The figures were representative of more than three separate experiments (\*P < 0.05, \*\*P < 0.01, \*\*\*P < 0.001). (B) Effect of D12 and D15 on the activation of PARP and inhibition of Bcl-2 in SGC7901 cells. 
SGC7901 cells were incubated with D12 and D15 (5, 10 and 20 mM, respectively) for 48 h. A117 (10 mM) was used as a positive control. The level of PARP and Bcl-2 after administration was assayed by the western blot analysis. The column figure was the normalized optical density as a percentage of the relevant total protein. + +uncorrected. The 1 H-NMR and 13C-NMR spectra data were recorded on a 600 MHz spectrometer (Bruker Corporation, Switzerland) with TMS as an internal standard. Part of the synthetic compounds had been screened for their anti-inflammatory properties, and the + +structural spectral data reported in our previous papers [\[30](#page-14-0)e[32\].](#page-14-0) In this article, we only listed the spectral data of unreported compounds (B7, B14, C1eC5, and D13-D15). High-resolution mass spectra (m/z) were recorded on a micrOTOF-Q II instrument. The + +c1ccccc1 + +c1ccccc1 + +purity of all compounds was detected by HPLC (column: Agilent Eclipse XDB C18 5 mm 4.6 mm 150 mm, flow: 1 mL/min, detected wavelength (DW): 420 nm or 450 nm, condition: methanol/water from 50/50 to 90/10). The retained time (TR) and purities of unreported compounds in this article were collected. + +# 4.1.1. General procedure for preparation of the intermediates 3a-3d + +A mixture of Vanillin (2, 10 mmol) and appropriate ketone (5 mmol, 1a: tetrahydro-4H-pyran-4-one, 1b: cyclohexanone, 1c: acetone) in ethanol (20 mL) was stirred at room temperature. HCl (gas) was bubbled into the solution to catalyze the reaction until it was completed. 4-Piperidone hydrochloride hydrate (1d) and Vanillin were dissolved in the mixture solvent of ethanol and water (10:1) different with others. The resulting mixture was cooled and poured into cold water (20 mL) to precipitate the product. The filter residue was purified by silica gel chromatography using PE/EA as eluent to afford the intermediates 3a-3d. + +# 4.1.2. General procedure for synthesis of compounds A1-A7 + +A solution of propionyl or isobutyryl chloride (3 mmol) in THF (2 mL) was added dropwise into a solution of 3a-3d (1 mmol) and Et3N (0.25 mL) in THF (10 mL) under the condition of ice-water bath (5e8 C). After being stirred overnight at room temperature, the solvent was removed under vacuum, water and CH2Cl2 were added. The mixture was extracted with dichloromethane twice and dried over MgSO4. The organic layer was concentrated in vacuum. The crude was refined by chromatography. + +# 4.1.3. Synthesis of the intermediates 5e and 5b + +A solution of Cyclopentanone (1e, 20 mmol) or cyclohexanone (1b), morpholine (4, 30 mmol) and 4-methylbenzenesulfonic acid (200 mg) in cyclohexane (20 mL) was heated to reflux at 90 C for 4 h. After cooling to room temperature, the mixture was washed with water, dried, and concentrated to obtain the enamine intermediates 5e or 5b as a brown oil, which was directly used for the next reaction. + +# 4.1.4. Synthesis of the intermediates 7e and 7b + +A mixture of 5e or 5b (10 mmol) and vanillin (10 mmol) was dissolved in ethanol (20 mL) and the resulting solution was stirred at 90 C for 2 h. The residue was concentrated under vacuum and purified by column chromatography to give an orange powder. The powder was re-dissolved in ethanol (10 mL), and 10% HCl solution (4 mL) was added. After being stirred at room temperature for 3 h, distilled water (20 mL) was poured into the reaction flask. A light yellow precipitate of 7e and 7b was collected and washed with water. + +# 4.1.5. 
General procedure for synthesis of compounds 8e1-8e14 and 8b1-8b5 + +Different aromatic aldehydes (2 mmol) were dissolved into a suspension of the intermediates 7e and 7b (2 mmol) in ethanol (10 mL). Subsequently, HCl gas was bubbled into the mixture for 30 min, and the resulting mixture was stirred at 50e70 C for 2e3 h. After removal of ethanol, water (10 mL) and ethyl ester (20 mL) were poured into the reaction solution. The organic solvent was removed under reduced pressure and the residue was purified by chromatography to obtain the products 8e1-8e14 and 8b1-8b5. + +# 4.1.6. General procedure for synthesis of compounds B1eB14 + +A mixture of compounds 8e1-8e14 (1 mmol) and Et3N (0.25 mL) dissolved in THF (10 mL) was cooled to 0 C. A solution of propionyl chloride or isobutyryl chloride (3 mmol) was added dropwise. The mixture was allowed to warm at room temperature and stirred overnight. After evaporation of the solvent, the mixture was extracted with CH2Cl2, washed with water, and concentrated under vacuum. The desired compounds B1eB14 were eluted with petroleum ether and ethyl acetate by column chromatography, with the final product as various color powders. + +4.1.6.1. 2-Methoxy-4-{(E)-{(E)-2-oxo-3-[4-(pyrrolidin-1-yl)benzylidene]cyclopentylidene}me-thyl}phenyl propionate (B7). Orange yellow powder, 6.5% yield, mp: 200.8e202.9 C. HPLC purity: 98.756% (TR ¼ 8.789 min, DW ¼ 420 nm). 1 H-NMR (500 MHz, CDCl3) d: 7.595 (s, 1H, b0 -H), 7.528 (d, J ¼ 8.5 Hz, 2H, H-20 , H-60 ), 7.492 (s, 1H, b-H), 7.207 (d, J ¼ 8.5 Hz, 1H, H-6), 7.171 (s, 1H, H-2), 7.084 (d, J ¼ 8.0 Hz, 1H, H-5), 6.593 (d, J ¼ 8.5 Hz, 2H, H-30 , H-50 ), 3.872 (s, 3H, 3-OCH3), 3.370 (t, J ¼ 7.0 Hz, 4H, CH2eNeCH2), 3.082 (s, 4H, 300eCH2, 400eCH2), 2.326 (t, J ¼ 7.5 Hz, 2H, eCOCH2CH3), 2.038 (t, J ¼ 6.5 Hz, 2H, NeCH2eCH2 2), 1.111 (t, J ¼ 7.0 Hz, 3H, eCOCH2CH3). 13C-NMR (125 MHz, CDCl3) d: 195.837, 173.063, 172.415, 151.146, 148.687, 140.369, 138.547, 135.776, 135.198, 133.140, 131.510, 123.034, 122.993, 114.543, 111.842, 55.918, 47.559, 29.688, 26.263, 25.4533, 9.631. HRMS (ESI): calcd for C27H29NO4 [MþH]þ: 432.2175, found: 432.2183. + +4.1.6.2. 2-Methoxy-4-{(E)-{(E)-2-oxo-3-[4-(pyrrolidin-1-yl)benzylidene]cyclopentylidene}me-thyl}phenyl isobutyrate (B14). Orange yellow powder, 12.0% yield, mp: 205.8e206.9 C. HPLC purity: 94.308% (TR ¼ 11.258 min, DW ¼ 420 nm). 1H-NMR (500 MHz, CDCl3) d: 7.575 (s, 1H, b0 -H), 7.512 (d, J ¼ 9.0 Hz, 2H, H-20 , H-60 ), 7.488 (s, 1H, b-H), 7.204 (d, J ¼ 8.0 Hz, 1H, H-6), 7.161 (s, 1H, H-2), 7.069 (d, J ¼ 8.0 Hz, 1H, H-5), 6.697 (d, J ¼ 8.5 Hz, 2H, H-30 , H-50 ), 3.860 (s, 3H, 3-OCH3), 3.424 (t, J ¼ 7.0 Hz, 4H, CH2eNeCH2), 3.075 (s, 4H, 300eCH2, 400eCH2), 2.823e2.879 (m, 1H, eCOCH(CH3)2), 2.000e2.020 (m, 4H, NeCH2eCH2 2), 1.211 (d, J ¼ 7.0 Hz, 3H, eCOCH(CH3)2). 13C-NMR (125 MHz, CDCl3) d: 184.589, 169.867, 145.598, 142,875, 134.630, 133.366, 131.531, 129.792, 129.683, 127.811, 125.712, 117.734, 117.243, 109.147, 105.723, 50.640, 38.114, 28.707, 23.520, 23.045, 17.777. HRMS (ESI): calcd for C28H31NO4 [MþH]þ: 446.2330, found: 446.2330. + +# 4.1.7. General procedure for synthesis of compounds C1eC5 + +With the intermediates 8b1-8b5 as starting material, the target compounds C1eC5 were synthesized by applying the same method described for synthesis of B1eB14. + +4.1.7.1. 2-Methoxy-4-{(E)-[(E)-3-(4-morpholinobenzylidene)-2 oxocyclohexylidene]methyl}ph-enyl propionate (C1). Yellow powder, 5.4% yield, mp: 186.1e189.4 C. HPLC purity: 96.795% (TR ¼ 8.568 min, DW ¼ 420 nm). 
1 H-NMR (500 MHz, CDCl3) d: 7.760 (s, 2H, b-H, b0 -H), 7.473 (d, J ¼ 8.5 Hz, 2H, H-20 , H-60 ), 7.279 (s, 1H, H-2), 7.056e7.073 (m, 2H, H-5, H-6), 6.939 (d, J ¼ 8.5 Hz, 2H, H-30 , H-50 ), 3.883 (t, J ¼ 4.5 Hz, 4H, CH2eOeCH2), 3.846 (s, 3H, 3-OCH3), 3.270 (t, J ¼ 3.0 Hz, 4H, CH2eNeCH2), 2.948 (d, J ¼ 2.0 Hz, 4H, 300eCH2, 500eCH2), 2.646 (q, J ¼ 2.5 Hz, 2H, eCOCH2CH3), 1.823 (m, 2H, 400eCH2), 1.297 (t, J ¼ 2.5 Hz, 3H, eCOCH2CH3). 13C-NMR (125 MHz, CDCl3) d: 184.74, 167.17, 145.61, 134.64, 132.15, 131.23, 130.31, 129.65, 127.02, 117.44, 117.37, 109.26, 109.18, 61.36, 50.65, 43.00, 23.37, 23.05, 22.10, 17.69. HRMS (ESI): calcd for C28H31NO5 [MþH]þ: 462.2278, found: 462.2278. + +4.1.7.2. 2-Methoxy-4-((E)-((E)-2-oxo-3-(4-(piperidin-1-yl)benzylidene)cyclohexylidene)methyl) phenyl propionate (C2). Orange yellow powder, 19.4% yield, mp: 126.2e129.0 C. HPLC purity: 97.949% (TR ¼ 10.122 min, DW ¼ 420 nm). 1 H-NMR (500 MHz, CDCl3) d: 7.758 (s, 1H, b0 -H), 7.732 (s, 1H, b-H), 7.437 (d, J ¼ 9.0 Hz, 2H, H-20 , H-60 ), 7.051 (s, 1H, H-2), 7.041 (d, J ¼ 7.5 Hz, 1H, H-6), 7.031 (d, J ¼ 7.5 Hz, 1H, H-5), 6.905 (d, J ¼ 9.0 Hz, 2H, H-30 , H-50 ), 3.843 (s, 3H, 3-OCH3), 3.289 (t, J ¼ 6.0 Hz, 4H, CH2eNeCH2), 2.946 (t, J ¼ 6.0 Hz, + +c1ccccc1 + +c1ccccc1 + +c1ccccc1 + +2H, 300eCH2), 2.905 (t, J ¼ 5.0 Hz, 2H, 500eCH2), 2.627 (q, J ¼ 7.5 Hz, 2H, eCOCH2CH3), 1.802 (t, J ¼ 5.5 Hz, 2H, 400eCH2), 1.693 (t, J ¼ 4.5 Hz, 4H, NeCH2eCH2 2), 1.631 (t, J ¼ 5.0 Hz, 2H, Ne CH2eCH2eCH2), 1.283 (t, J ¼ 7.5 Hz, 3H, eCOCH2CH3). 13C-NMR (125 MHz, CDCl3) d: 189.996, 172.467, 151.749, 150.889, 139.875, 138.032, 136.707, 135.275, 135.048, 132.484, 125.710, 122.701, 122.626, 114.656, 114.466, 55.940, 49.182, 28.712, 28.340, 27.385, 25.501, 24.338, 23.303, 9.148. HRMS (ESI): calcd for C29H33NO4 [MþH]þ: 460.2488, found: 460.2490. + +4.1.7.3. 4-{(E)-{(E)-3-[4-(Diethylamino)benzylidene]-2 oxocyclohexylidene}methyl}-2-meth-oxyphenyl propionate (C3). Orange yellow powder, 31.6% yield, mp: 141.7e144.2 C. HPLC purity: 97.569% (TR ¼ 8.019 min, DW ¼ 420 nm). 1 H-NMR (500 MHz, CDCl3) d: 7.776 (s, 1H, b0 -H), 7.733 (s, 1H, b-H), 7.448 (d, J ¼ 8.5 Hz, 2H, H-20 , H-60 ), 7.048 (s, 1H, H-2), 7.040 (d, J ¼ 7.5 Hz, 1H, H-6), 7.033 (d, J ¼ 7.5 Hz, 1H, H-5), 6.680 (d, J ¼ 8.5 Hz, 2H, H-30 , H-50 ), 3.844 (s, 3H, 3-OCH3), 3.408 (q, J ¼ 7.0 Hz, 4H, CH2eNeCH2), 2.951 (t, J ¼ 5.5 Hz, 2H, 300eCH2), 2.898 (t, J ¼ 5.5 Hz, 2H, 500eCH2), 2.2628 (q, J ¼ 7.5 Hz, 2H, eCOCH2CH3), 1.805 (t, J ¼ 6.0 Hz, 2H, 400eCH2), 1.283 (t, J ¼ 7.5 Hz, 3H, eCOCH2CH3), 1.199 (t, J ¼ 7.0 Hz, 6H, NeeCH2eCH3 2). 13C-NMR (125 MHz, CDCl3) d: 184.623, 167.213, 145.555, 134.470, 133.447, 131.573, 129.874, 129.670, 127.831, 117.373, 117.300, 109.133, 105.742, 50.638, 39.132, 23.518, 23.042, 22.097, 17.771. HRMS (ESI): calcd for C28H33NO4 [MþH]þ: 448.2488, found: 448.2506. + +4.1.7.4. 2-Methoxy-4-{(E)-[(E)-3-(4-morpholinobenzylidene)-2 oxocy-clohexylidene]methyl} phenylisobutyrate (C4). Yellow powder, 8.1% yield, mp: 165.7e168.1 C. HPLC purity: 93.093% (TR ¼ 8.459 min, DW ¼ 420 nm). 1 H-NMR (500 MHz, CDCl3) d: 7.756 (s, 1H, b0 -H), 7.738 (s, 1H, b-H), 7.463 (d, J ¼ 9.0 Hz, 2H, H-20 , H-60 ), 7.028e7.047 (m, 2H, H-2, H-5), 7.038 (t, 1H, H-6), 6.906 (d, J ¼ 8.5 Hz, 2H, H-30 , H-50 ), 3.873 (t, J ¼ 5.0 Hz, 4H, CH2eOeCH2), 3.837 (s, 3H, 3-OCH3), 3.255 (t, J ¼ 5.0 Hz, 4H, CH2eNeCH2), 2.902e2.953 (m, 4H, 300eCH2, 500eCH2), 2.821e2.877 (m, 1H, eCOCH(CH3)2), 1.783e1.830 (m, 2H, 400eCH2), 1.334 (d, J ¼ 7.0 Hz, 6H, eCOCH(CH3)2). 
13C-NMR (125 MHz, CDCl3) d: 184.743, 169.843, 145.655, 143.809, 132.116, 131.181, 130.360, 129.563, 128.077, 127.008, 117.446, 117.315, 109.256, 109.197, 61.365, 50.651, 43.000, 28.708, 23.367, 23.050, 17.696, 13.739, 13.718, 13.688. HRMS (ESI): calcd for C29H33NO5 [MþH]þ: 462.2644, found: 462.2646. + +4.1.7.5. 2-Methoxy-4-{(E)-{(E)-2-oxo-3-[4-(piperidin-1-yl)benzylidene]cyclohexylidene}methyl} phenyl isobutyrate (C5). Orange yellow powder, 26.7% yield, mp: 158.5e161.6 C. HPLC purity: 93.671% (TR ¼ 12.264 min, DW ¼ 420 nm). 1 H-NMR (500 MHz, CDCl3) d: 7.758 (s, 1H, b0 -H), 7.733 (s, 1H, b-H), 7.437 (d, J ¼ 9.0 Hz, 2H, H-20 , H-60 ), 7.042e7.063 (m, 2H, H-2, H-5), 7.026 (s, 1H, H-6), 6.906 (d, J ¼ 8.5 Hz, 2H, H-30 , H-50 ), 3.834 (s, 3H, 3-OCH3), 3.288 (t, J ¼ 5.5 Hz, 4H, CH2eNeCH2), 2.944 (t, J ¼ 5.5 Hz, 2H, 300eCH2), 2.902 (t, J ¼ 5.5 Hz, 2H, 500eCH2), 2.819e2.875 (m, 1H, eCOCH(CH3)2), 1.801 (t, J ¼ 6.0 Hz, 2H, 400eCH2), 1.684e1.693 (m, 4H, NeCH2eCH2 2), 1.626 (d, J ¼ 4.5 Hz, 2H, NeCH2eCH2eCH2), 1.333 (d, J ¼ 7.0 Hz, 6H, eCOCH(CH3)2). 13C-NMR (125 MHz, CDCl3) d: 184.685, 169.844, 146.441, 145.627, 134.726, 132.693, 131.350, 130.031, 129.673, 127.192, 120.407, 117.416, 117.280, 109.356, 109.171, 50.646, 43.885, 28.710, 23.430, 23.057, 20.213, 19.050, 17.733, 13.750, 13.696. HRMS (ESI): calcd for C30H35NO4 [MþH]þ: 474.2644, found: 474.2654. + +# 4.1.8. General procedure for synthesis of compounds D1-D13 + +The preparation procedure of compounds D1-D13 was the same as compounds 8e1-8e14, which used the different substituted aromatic aldehyde for the aldol condensation reaction. + +4.1.8.1. (2E,6E)-2-(4-hydroxy-3-methoxybenzylidene)-6-(4-(pyrrolidi-1-yl)benzylidene)cyclo-hexanone (D13). Orange yellow powder, 15.2% yield, mp: 203.2e206.1 C. HPLC purity: 99.267% (TR ¼ 6.230 min, DW ¼ 450 nm). 1 H-NMR (500 MHz, DMSO-d6) d: 7.558 (d, J ¼ 7.5 Hz, 2H, H-20 , H-60 ), 7.436 (s, 1H, b0 -H), 7.419 (s, 1H, b-H), 7.100 (s, 1H, H-2), 7.013 (d, J ¼ 8.5 Hz, 1H, H-6), 6.851 (d, J ¼ 7.0 Hz, 1H, H-5), 6.595 (d, J ¼ 8.5 Hz, 2H, H-30 , H-50 ), 3.817 (s, 3H, 3-OCH3), 3.292 (t, J ¼ 6.5 Hz, 4H, CH2eNeCH2), 2.874 (d, J ¼ 5.0 Hz, 4H, 300eCH2, 500eCH2), 1.968 (t, J ¼ 6.5 Hz, 4H, NeCH2eCH2 2), 1.728 (t, J ¼ 5.5 Hz, 2H, 400eCH2). 13C-NMR (125 MHz, DMSO-d6) d: 188.028, 147.901, 147.612, 147.388, 137.017, 135.397, 133.754, 132.536, 130.769, 127.043, 124.032, 122.370, 115.490, 114.655, 111.547, 55.617, 47.171, 28.229, 27.919, 24.919, 22.631. HRMS (ESI): calcd for C25H27NO3 [MþH]þ: 390.2069, found: 390.2078. + +# 4.1.9. Preparation of the protected vanillin 10 + +A solution of Vanillin (2, 10 mmol), 3,4-2H-dihydropyran (9, 40 mmol) and pyridinium 4-toluenesulfonate (2.51 mmol) in CH2Cl2 (30 mL) was stirred at room temperature for 3 h. The resulting mixture was extracted with CH2Cl2 twice and concentrated in vacuum. The protected product 10 was purified by silica gel column chromatography as colorless oil. + +# 4.1.10. Preparation of the unilateral substituted intermediate 11 + +To a solution of compound 10 (5 mmol) in acetone (20 mL) a solution of 20% NaOH (1 mL) was added dropwise slowly at room temperature. TLC monitored the endpoint and removed the solvent. The residue was extracted with ethyl ester (20 mL) and washed water (10 mL) twice. After evaporating the ethyl ester, the mixture was purified by column chromatography to obtain the intermediate 11 as white powder. + +# 4.1.11. 
General procedure for synthesis of compounds D14-D15 + +To a mixture of dry powdered reactant 10 (1 mmol) and corresponding substituted benzaldehyde (1 mmol), a suspension of NaOH in dioxane (5g NaOH dispersed in 20 mL dioxane) was added dropwise until the reaction color turned wine red. The solution was stirred at room temperature for 4 h. The residue was concentrated and extracted with ethyl ester (20 mL). After purification by column chromatography, the colored powder was dissolved in ethanol (10 mL), and de-protected by 10% HCl solution (4 mL). The formed precipitates was filtered and purified by column chromatography to give the desired products D14-D15. + +4.1.11.1. (1E,4E)-1-(4-Hydroxy-3-methoxyphenyl)-5-[4-(pyrrolidin-1-yl)phenyl]penta-1,4-dien-3-one (D14). Red powder, 3.0% yield, mp: 204.3e207.4 C. HPLC purity: 97.934% (TR ¼ 7.402 min, DW ¼ 450 nm). 1 H-NMR (500 MHz, DMSO-d6) d: 7.559 (d, J ¼ 7.0 Hz, 2H, H-20 , H-60 ), 7.547 (d, J ¼ 15.5 Hz, 1H, b0 -H), 7.539 (d, J ¼ 15.5 Hz, 1H, b-H), 7.148 (s, 1H, H-2), 7.059 (d, J ¼ 8.0 Hz, 1H, H-6), 6.960 (d, J ¼ 16.0 Hz, 1H, a0 -H), 6.829 (d, J ¼ 15.0 Hz, 1H, a-H), 6.579 (d, J ¼ 8.0 Hz, 2H, H-30 , H-50 ), 6.504 (d, J ¼ 7.0 Hz, 1H, H-5), 3.769 (s, 3H, 3-OCH3), 3.308 (d, J ¼ 6.5 Hz, 4H, CH2eNeCH2), 1.974 (s, 4H, NeCH2eCH2 2). 13C-NMR (125 MHz, DMSO-d6) d: 187.574, 149.248, 149.199, 147.938, 143.235, 141.853, 130.341, 130.196, 126.513, 123.319, 123.078, 121.503, 120.195, 115.646, 111.762, 111.450, 55.723, 47.423, 24.911. HRMS (ESI): calcd for C22H23NO3 [MþH]þ: 350.1756, found: 350.1760. + +4.1.11.2. (1E,4E)-1-[4-(Diethylamino)phenyl]-5-(4-hydroxy-3 methoxyphenyl)penta-1,4-dien-3-one (D15). Red powder, 7.8% yield, mp: 134.7e135 C. HPLC purity: 99.900% (TR ¼ 6.311 min, DW ¼ 450 nm). 1 H-NMR (500 MHz, DMSO-d6) d: 7.624 (d, J ¼ 15.5 Hz, 1H, b0 -H), 7.592 (d, J ¼ 15.5 Hz, 1H, b-H), 7.567 (d, J ¼ 9.5 Hz, 2H, H-20 , H-60 ), 7.352 (s, 1H, H-2), 7.182 (d, J ¼ 8.0 Hz, 1H, + +c1ccccc1 + +c1ccccc1 + +H-6), 7.119 (d, J ¼ 15.5 Hz, 1H, a0 -H), 6.977 (d, J ¼ 15.5 Hz, 1H, a-H), 6.812 (d, J ¼ 8.0 Hz, 1H, H-5), 6.709 (d, J ¼ 8.0 Hz, 2H, H-30 , H-50 ), 3.848 (s, 3H, 3-OCH3), 3.414 (d, J ¼ 6.5 Hz, 4H, CH2eNeCH2), 1.124 (t, J ¼ 6.5 Hz, 6H, NeCH2eCH3 2). 13C-NMR (125 MHz, DMSO-d6) d: 187.586, 149.264, 149.216, 147.942, 143.006, 141.864, 130.527, 126.502, 123.295, 123.084, 121.242, 120.244, 115.651, 111.472, 111.224, 55.727, 43.759, 12.437. HRMS (ESI): calcd for C22H25NO3 [MþH]þ: 352.1912, found: 352.1928. + +#### 4.2. In vitro kinase assay + +Based on microfluidics chip technology, the kinase activity of FGFR1 was determined by Mobility Shift Assay on EZ Reader (Caliper Life Sciences, MA). As the experimental condition was constrained, the kinase activity screening and ATP competitive assay both were authorized to ChemPartner (Shanghai, China). The specific protocol is described briefly as follows: + +The enzyme solution was prepared in 1.25 kinase base buffer (62.5 mM HEPES, 0.001875% Brij-35, 12.5 mM MgCl2, 2.5 mM DTT), stop buffer (100 mM HEPES, 0.015% Brij-35, 0.2% coating reagent No. 3, 50 mM EDTA) The compound was dissolved in DMSO, and diluted to the specific concentration with water in 384-well plate. 5 mL of compound solution was diluted with 10 mL of 2.5 enzyme solution. 10 mM EDTA was used as the blank control. The mixture was incubated at room temperature for 10 min. 10 mL of peptide solution (2.5, FAM-P22, add FAM-labeled peptide and ATP in the 1.25 kinase base buffer) was added to each well of the 384-well assay plate. 
After incubation at 28 °C for 1 h, 25 μL of stop buffer was added to the mixture to stop the reaction. Conversion data were collected on a Caliper EZ Reader (Hopkinton, MA), and the percent inhibition of kinase activity was calculated with the formula below: + +Inhibition % = (max − conversion)/(max − min) × 100% + +"Max" stands for the DMSO control; "Min" stands for the low control. + +To determine the half-maximal inhibitory concentration of the compounds against kinase activity, 10 gradient concentrations of each compound (100, 33.33, 11.11, 3.70, 1.23, 0.41, 0.14, 0.046, 0.015, and 0.005 μM) were set up. The inhibition ratios at the different concentrations were calculated, and the concentration–inhibition curve was fitted with GraphPad Prism. + +#### 4.3. Quantitative structure-activity relationships analysis + +## 4.3.1. Materials and descriptors selection + +All products were drawn and saved in SDF format. Molecular descriptors were calculated for each compound with the ChemoPy descriptor calculation program [\[37,38\].](#page-14-0) About 1000 molecular descriptors based on molecular structure were obtained, including constitutional, physicochemical, topological, geometrical, and charge (electronic) descriptors. After the calculation, descriptors that were constant across all molecules were eliminated; pairs of variables with a correlation coefficient greater than 0.75 were classified as inter-correlated, and one descriptor in each correlated pair was deleted. + +#### 4.3.2. Multiple linear regression (MLR) analysis + +Multiple linear regression (MLR) analysis is a statistical technique that uses several explanatory variables to predict the outcome of a response variable. The goal of MLR is to model the relationship between the explanatory and response variables. In the present study, MLR was performed with the R program, a widely used tool for statistical computing and graphics, to derive the QSAR models. The biological data used in this study were the FGFR1 inhibitory rates relative to the negative control group. Compounds with negative values were excluded because they showed no inhibitory activity. The inhibition rates against FGFR1 were used as the dependent variable in the linearization procedure. Subsequently, stepwise multiple linear regression (stepwise-MLR) was used to select the significant descriptors, and the most relevant descriptors were used as independent variables. + +#### 4.3.3. Validation of the models + +To confirm that the linear model was reliable, its predictive ability and generalizability were tested by cross-validation using the leave-one-out (LOO) procedure. One data point at a time was removed from the analyzed set, the regression was recalculated, and the predicted value for that point was compared with its actual value. This process was repeated until each datum had been omitted once, and the sum of squares of these deletion residuals was used to calculate q², a statistic analogous to R². + +#### 4.4. ATP competitive inhibition assay + +Using the kinase inhibitory activity screening assay, four gradient concentrations of each compound to be measured were set up. At each compound concentration, eight concentrations of ATP (4192, 2096, 1048, 524, 262, 131, 66, and 33 μM) were established to determine the conversion of the substrate peptide catalyzed by FGFR1 kinase within 1 h. The inhibition curves were fitted with GraphPad Prism.
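As a concrete illustration of the percent-inhibition arithmetic and the dose–response fitting described in Sections 4.2 and 4.4, the following minimal Python sketch converts conversion readouts into inhibition values and fits an IC50. The readout numbers are synthetic, and SciPy's `curve_fit` with a four-parameter logistic is used here only as a stand-in for the GraphPad Prism fit reported above.

```python
import numpy as np
from scipy.optimize import curve_fit

def percent_inhibition(conversion, max_ctrl, min_ctrl):
    # Inhibition % = (max - conversion) / (max - min) * 100, per Section 4.2
    return (max_ctrl - conversion) / (max_ctrl - min_ctrl) * 100.0

def four_pl(conc, bottom, top, ic50, hill):
    # Four-parameter logistic: inhibition rises from `bottom` to `top` with concentration
    return bottom + (top - bottom) / (1.0 + (ic50 / conc) ** hill)

# 10-point dilution series used in the assay (uM); conversion values are made-up example data
concs = np.array([100, 33.33, 11.11, 3.70, 1.23, 0.41, 0.14, 0.046, 0.015, 0.005])
conversion = np.array([3.5, 4.2, 6.8, 11.5, 20.0, 30.0, 37.5, 41.5, 43.5, 44.5])
max_ctrl, min_ctrl = 45.0, 2.0  # hypothetical DMSO ("max") and low ("min") controls

inhibition = percent_inhibition(conversion, max_ctrl, min_ctrl)
popt, _ = curve_fit(four_pl, concs, inhibition,
                    p0=[1.0, 99.0, 1.0, 1.0],
                    bounds=(0.0, [50.0, 150.0, 1000.0, 10.0]))
print(f"fitted IC50 ~= {popt[2]:.3f} uM")
```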
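Likewise, the descriptor pruning (Section 4.3.1) and the leave-one-out validation (Section 4.3.3) can be sketched in a few lines. This is an assumed pandas/scikit-learn implementation rather than the R workflow actually used: constant descriptors are dropped, one member of every descriptor pair correlated above 0.75 is removed, and q² is computed as 1 − PRESS/SS_total from leave-one-out refits of the linear model.

```python
import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.linear_model import LinearRegression

def prune_descriptors(X: pd.DataFrame, r_cut: float = 0.75) -> pd.DataFrame:
    # Drop descriptors that are constant for all molecules
    X = X.loc[:, X.nunique() > 1]
    # For every inter-correlated pair (|r| > r_cut), keep the first member and drop the second
    corr = X.corr().abs()
    dropped = set()
    for a, b in combinations(corr.columns, 2):
        if a not in dropped and b not in dropped and corr.loc[a, b] > r_cut:
            dropped.add(b)
    return X.drop(columns=sorted(dropped))

def loo_q2(X: np.ndarray, y: np.ndarray) -> float:
    # Leave-one-out cross-validation: q2 = 1 - PRESS / SS_total
    press = 0.0
    for i in range(len(y)):
        mask = np.arange(len(y)) != i
        fit = LinearRegression().fit(X[mask], y[mask])
        press += float((y[i] - fit.predict(X[i:i + 1])[0]) ** 2)
    ss_total = float(((y - y.mean()) ** 2).sum())
    return 1.0 - press / ss_total

# Tiny synthetic demo: 12 "compounds", 6 descriptors, inhibition rates as the response
rng = np.random.default_rng(0)
X_raw = pd.DataFrame(rng.normal(size=(12, 6)), columns=[f"desc{i}" for i in range(6)])
X_raw["desc5"] = 1.0                    # constant descriptor -> removed
X_raw["desc4"] = X_raw["desc0"] * 0.99  # inter-correlated pair -> one member removed
y = 10.0 * X_raw["desc0"].to_numpy() + rng.normal(scale=0.5, size=12)

X = prune_descriptors(X_raw)
print("kept descriptors:", list(X.columns))
print("LOO q2:", round(loo_q2(X.to_numpy(), y), 3))
```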
+ +#### 4.5. Molecular docking + +The binding poses of compounds D12 and D15 in the FGFR1 binding site were predicted with the software AutoDock (version 4.2.6) [\[39\].](#page-14-0) The crystallographic coordinates of human wild-type FGFR1 (PDB ID: 3RHX) were retrieved from the Protein Data Bank (PDB). Prior to docking, the protein structure was prepared by removing water molecules and other ligands with the PyMOL software [\[40\]](#page-14-0). Individual ligand files were prepared for docking with the prepare\_ligand4.py script, and charges and hydrogen atoms were added to the FGFR1 protein with the prepare\_receptor4.py script from AutoDockTools 1.5.6. A grid box of 60 × 60 × 60 points, with a spacing of 0.375 Å between grid points, was used and covered almost the entire FGFR1 binding site. The grid parameter files were created by setting up the map files directly. The Lamarckian genetic algorithm (LGA) was applied to model the protein–inhibitor interactions. The docking parameters were as follows: 100 docking trials, a population of 300 individuals, a crossover rate of 0.8, a local search rate of 0.06, and a maximum of 25 million energy evaluations. Other settings were left at their defaults. AutoDockTools and PyMOL were used to analyze the docking results. + +#### 4.5.1. Molecular dynamics (MD) simulations + +Molecular dynamics (MD) simulations were performed to verify whether the docked poses are stable. Simulations of the D12/FGFR1 and D15/FGFR1 complex systems were carried out. The initial complexes came from the top-ranked binding poses, obtained by clustering the docking results on the root mean square deviation (RMSD = 2) of each pose. The partial charges of D12 and D15 were derived with the restrained electrostatic potential (RESP) fitting procedure, based on electrostatic potentials calculated by the Hartree-Fock (HF) method with the 6-31G(d) basis set in the Gaussian 09 package [\[41\]](#page-14-0). Molecular mechanics parameters from the ff99SB and GAFF force fields were assigned, using the LEaP module of the AMBER 11 software package, to the + +c1ccccc1 + +c1ccccc1 + +c1ccccc1 + +protein and the ligand, respectively [\[42,43\].](#page-14-0) All the water molecules and the specified ligand were deleted. The two systems were each solvated in a box of TIP3P water molecules with a hydration shell of 10 Å. Additionally, an appropriate number of sodium ions was added to neutralize each system. + +The AMBER 11 software package was employed, with the same protocol, for all simulations [\[42\]](#page-14-0). We carried out an equilibration protocol consisting of an initial minimization of the water box over 5000 steps, 2500 steps of steepest descent followed by 2500 steps of conjugate gradient. Then, the TIP3P water box was heated at constant volume to 300 K using a time constant for the heat bath with a coupling time of 100 ps. Equilibration was then performed at 300 K under constant pressure for 100 ps. Before the production MD, the whole system was equilibrated for 100 ps at a constant pressure of 1 bar, with Langevin temperature scaling turned on at a collision frequency of 2 ps⁻¹ [\[44\].](#page-14-0) The last step was a 50 ns production MD simulation without any restraints. In addition, short-range nonbonded interactions were calculated with a cutoff of 8.0 Å. Periodic boundary conditions were applied in every step of the whole process.
Long-range electrostatic interactions were handled by the Particle Mesh Ewald (PME) algorithm [\[45\]](#page-14-0). The SHAKE method was used to constrain hydrogen atoms, and the time step was set to 2 fs [\[46\]](#page-14-0). The coordinates were saved every 10 ps for the subsequent analysis. + +#### 4.5.2. Binding free energy calculations and decomposition analysis + +The binding free energies were determined by the molecular mechanics/generalized Born surface area (MM-GBSA) method as implemented in AMBER 11. For each system, 500 snapshots from the last 10 ns of the MD trajectory were chosen for the calculations. The binding free energy was calculated according to the equations: + +$$ +\Delta G_{\text{bind}} = \Delta G_{\text{complex}} - \left(\Delta G_{\text{receptor}} + \Delta G_{\text{ligand}}\right) \tag{1} +$$ + +$$ +\Delta G_{\text{bind}} = \Delta H - T\Delta S \approx \Delta E_{\text{gas}} + \Delta G_{\text{sol}} - T\Delta S \tag{2} +$$ + +$$ +\Delta E_{\text{gas}} = \Delta E_{\text{vdW}} + \Delta E_{\text{elec}} \tag{3} +$$ + +$$ +\Delta G_{\text{sol}} = \Delta G_{\text{GB}} + \Delta G_{\text{SA}} \tag{4} +$$ + +In equation (3), ΔEgas is the gas-phase interaction energy between protein and ligand, comprising the van der Waals and electrostatic energies; ΔGGB and ΔGSA in equation (4) are the polar and nonpolar components of the desolvation free energy, respectively; TΔS, at temperature T, represents the change in conformational entropy upon ligand binding. + +#### 4.6. Biological activity experiments + +#### 4.6.1. Cell culture + +SGC7901 cells were obtained from the Cell Bank of the Chinese Academy of Sciences (Wuhan, China), and MGC823 and BGC803 cells were purchased from the Shanghai Institute of Biosciences and Cell Resources Center (Chinese Academy of Sciences, Shanghai, China). All cells were grown in RPMI-1640 medium (Gibco) supplemented with 10% FBS (Gibco) and 1% Penicillin–Streptomycin (Gibco) in a humidified ThermoForma incubator (Thermo Fisher Scientific) containing 5% CO2 at 37 °C. + +# 4.6.2. MTT cytotoxicity assay + +Cells (4 × 10³) were seeded in a 96-well cell culture plate and treated with compounds for 72 h. The assay was performed by adding 20 μL of methylthiazoletetrazolium (MTT; 0.5 mg/mL; M5655; Sigma) to the culture medium and incubating at 37 °C for 4 h. At the end of the incubation, the formazan crystals were dissolved in 150 μL dimethyl sulfoxide (Sigma) and quantified at 490 nm. + +#### 4.6.3. Western blot analysis + +Treated cells were washed with PBS and harvested using RIPA Buffer (AR0103; BOSTER, Wuhan, China) with 1% PMSF (100 mM; ST506; Beyotime). Protein quantification was performed using the Bradford 1× Dye Reagent (#5000205; Bio-Rad). Cell extracts were separated by 10% SDS-PAGE and transferred to a PVDF membrane (Millipore). The membrane was then blocked for 90 min at room temperature in TBS with 0.1% Tween and 5% milk powder. The membrane was incubated with the following primary antibodies: Phospho-Flg (Tyr653/654; 1:300; sc-30262; Santa Cruz Biotechnology, Inc.), p-ERK (1:300; sc-7383; Santa Cruz Biotechnology, Inc.), cleaved PARP-1 (194C1439; 1:300; sc-56196; Santa Cruz Biotechnology, Inc.), cyclin D1 (1:300; sc-753; Santa Cruz Biotechnology, Inc.), Bcl-2 (1:300; sc-492; Santa Cruz Biotechnology, Inc.), or GAPDH (1:300; sc-25778; Santa Cruz Biotechnology, Inc.).
After washing, the membranes were incubated with the relevant secondary antibodies, visualization of bands was by enhanced chemiluminescence (ECL; Bio-Rad), and densitometric analysis made using Image J software. + +#### 4.6.4. Hoechst 33342 staining + +SGC7901 cells at logarithmic growth were seeded in 6-well cell culture plate and treated with DMSO at different concentrations of D12 or D15 or 10 mM A117 for 12 h. Following fixation with 4% paraformaldelyde for 10 min at room temperature (RT), cells were washed and stained with Hoechst 33342 (Beyotime) for 10 min. Cells were observed under a fluorescence microscope (Nikon, Tokyo, Japan) using appropriate filters for blue fluorescence. + +#### 4.6.5. Cell cycle analysis + +Cells (3 105 ) were treated with agents for 8 h, washed with PBS, collected, and fixed in 75% ice-cold ethanol for 4 h. After washing with PBS, the cells were stained with 500 mL of propidium iodide (PI) (Becton, Dickinson and Company) for 10 min at 4 C in the dark. The cell suspension was filtered with 200 - mesh gauze and subjected to a FACSCalibur instrument (BD Biosciences Clontech). Data were analyzed using the ModFit DNA analysis program. + +## 4.6.6. Statistical analysis + +The results were presented as the mean ± standard error (SEMs). The statistics were performed using Student's t-test in GraphPad Pro (GraphPad, San Diego, CA, USA). A value of P less than 0.05 (p < 0.05) was considered statistically significant. All experiments were repeated a minimum of three times. + +# Conflict of interest + +The authors have declared no conflict of interest. + +#### Acknowledgment + +This work was supported by the National Natural Science Foundation of China (Grant Nos. 81402839, 81272462, 81473242), the Natural Science Foundation of Zhejiang Province of China (LY17H160059), and the Technology Foundation for Medical Science of Zhejiang Province (Grant No. 2012KYA129). + +#### Appendix A. Supplementary data + +Supplementary data associated with this article can be found in the online version, at [http://dx.doi.org/10.1016/j.ejmech.2016.10.](http://dx.doi.org/10.1016/j.ejmech.2016.10.066) [066](http://dx.doi.org/10.1016/j.ejmech.2016.10.066). These data include MOL files and InChiKeys of the most important compounds described in this article. + +c1ccccc1 + +c1ccccc1 + +#### References + +- [1] [J. Ferlay, I. Soerjomataram, R. Dikshit, S. Eser, C. Mathers, M. Rebelo,](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref1) [D.M. Parkin, D. Forman, F. Bray, Cancer incidence and mortality worldwide:](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref1) [sources, methods and major patterns in GLOBOCAN 2012, Int. J. Cancer 136](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref1) [\(2015\) E359](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref1)e[E386.](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref1) +- [2] [R.L. Siegel, K.D. Miller, A. Jemal, Cancer statistics, 2015, CA-A Cancer J. Clin. 65](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref2) [\(2015\) 5](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref2)e[29](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref2). +- [3] [T. Oshima, M. Masuda, Molecular targeted agents for gastric and gastro](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref3)[esophageal junction cancer, Surg. Today 42 \(2012\) 313](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref3)e[327.](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref3) +- [4] [J. Wang, R. Xu, J. Li, Y. Bai, T. Liu, S. Jiao, G. Dai, J. Xu, Y. Liu, N. 
Fan, Y. Shu, Y. Ba,](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref4) [D. Ma, S. Qin, L. Zheng, W. Chen, L. Shen, Randomized multicenter phase III](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref4) study of a modifi[ed docetaxel and cisplatin plus](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref4) fluorouracil regimen compared with cisplatin and fluorouracil as fi[rst-line therapy for advanced or](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref4) [locally recurrent gastric cancer, Gastric Cancer 19 \(2016\) 234](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref4)e[244](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref4). +- [5] [S.S. Khakoo, A. Georgiou, J.S. Waters, A restrospective analysis of toxicities](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref5) [encountered with palliative epirubicin, oxaliplatin, and capecitabine \(EOX\)](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref5) [chemotherapy for advanced esophagogastric cancer, J. Clin. Oncol. 32 \(2014\)](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref5). +- [6] [E.C. Smyth, D. Cunningham, Targeted therapy for gastric cancer, Curr. Treat.](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref6) [Options Oncol. 13 \(2012\) 377](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref6)e[389](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref6). +- [7] [H. Arai, S. Hironaka, T. Suzuki, K. Sudo, K. Nakamura, K. Minashi, T. Hara,](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref7) [T. Denda, T. Yamaguchi, A retrospective study of early toxicity of weekly](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref7) [paclitaxel as second line chemotherapy for advanced gastric cancer, Ann.](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref7) [Oncol. 26 \(2015\), 132](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref7)e[132.](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref7) +- [8] [T. Aoyama, T. Yoshikawa, Targeted therapy: apatinib new third-line option](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref8) [for refractory gastric or GEJ cancer, Nat. Rev. Clin. Oncol. 13 \(2016\) 268](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref8)e[270](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref8). +- [9] [V. Brower, Apatinib in treatment of refractory gastric cancer, Lancet Oncol. 17](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref9) [\(2016\) e137](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref9). +- [10] [J. Zhang, P.L. Yang, N.S. Gray, Targeting cancer with small molecule kinase](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref10) [inhibitors, Nat. Rev. Cancer 9 \(2009\) 28](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref10)e[39](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref10). +- [11] [C. Hierro, J. Rodon, J. Tabernero, Fibroblast growth factor \(FGF\) receptor/FGF](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref11) [inhibitors: novel targets and strategies for optimization of response of solid](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref11) [tumors, Semin. Oncol. 42 \(2015\) 801](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref11)e[819.](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref11) +- [12] [M.H. Schafer, P. Lingohr, A. Strasser, N.C. Lehnen, M. Braun, S. Perner, T. Holler,](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref12) [G. Kristiansen, J.C. Kalff, I. 
Gutgemann, Fibroblast growth factor receptor 1](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref12) gene amplifi[cation in gastric adenocarcinoma, Hum. Pathol. 46 \(2015\)](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref12) [1488](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref12)e[1495](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref12). +- [13] [M.V. Dieci, M. Arnedos, F. Andre, J.C. Soria, Fibroblast growth factor receptor](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref13) [inhibitors as a cancer treatment: from a biologic rationale to medical per](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref13)[spectives, Cancer Discov. 3 \(2013\) 264](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref13)e[279.](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref13) +- [14] [M. Touat, E. Ileana, S. Postel-Vinay, F. Andre, J.C. Soria, Targeting FGFR](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref14) [signaling in Cancer, Clin. Cancer Res. 21 \(2015\) 2684](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref14)e[2694.](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref14) +- [15] [E.P. Carter, A.E. Fearon, R.P. Grose, Careless talk costs lives:](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref15) fibroblast growth [factor receptor signalling and the consequences of pathway malfunction,](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref15) [Trends Cell Biol. 25 \(2015\) 221](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref15)e[233](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref15). +- [16] [G. Liang, Z. Liu, J. Wu, Y. Cai, X. Li, Anticancer molecules targeting](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref16) fibroblast [growth factor receptors, Trends Pharmacol. Sci. 33 \(2012\) 531](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref16)e[541](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref16). +- [17] [A. Dutt, A.H. Ramos, P.S. Hammerman, C. Mermel, J. Cho, T. Sharifnia,](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref17) [A. Chande, K.E. Tanaka, N. Stransky, H. Greulich, N.S. Gray, M. Meyerson, In](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref17)hibitor-sensitive FGFR1 amplifi[cation in human non-small cell lung cancer,](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref17) [PLoS One 6 \(2011\) e20351](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref17). +- [18] [I. Tsimafeyeu, L. Demidov, E. Stepanova, N. Wynn, H. Ta, Overexpression of](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref18) fi[broblast growth factor receptors FGFR1 and FGFR2 in renal cell carcinoma,](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref18) [Scand. J. Urol. Nephrol. 45 \(2011\) 190](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref18)e[195.](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref18) +- [19] [V.K. Jain, N.C. Turner, Challenges and opportunities in the targeting of](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref19) fibro[blast growth factor receptors in breast cancer, Breast Cancer Res. 14 \(2012\)](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref19) [208](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref19). +- [20] [N. Turner, R. Grose, Fibroblast growth factor signalling: from development to](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref20) [cancer, Nat. Rev. Cancer 10 \(2010\) 116](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref20)e[129](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref20). +- [21] [G. Daniele, J. Corral, L.R. Molife, J.S. 
de Bono, FGF receptor inhibitors: role in](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref21) [cancer therapy, Curr. Oncol. Rep. 14 \(2012\) 111](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref21)e[119](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref21). +- [22] [N. Miyamoto, H. Yamamoto, C. Miyamoto, T. Maehata, K. Nosho, H. Taniguchi,](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref22) [K. Yamashita, Y. Adachi, Y. Arimura, F. Itoh, K. Imai, Y. Shinomura, Over](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref22)[expression of the receptor tyrosine kinase epha4 plays an important role in](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref22) [the progression of human gastric cancers, Gastroenterology 134 \(2008\)](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref22) [A611](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref22)e[A612](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref22). +- [23] [D. Wen, S. Li, F. Ji, H. Cao, W. Jiang, J. Zhu, X. Fang, miR-133b acts as a tumor](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref23) [suppressor and negatively regulates FGFR1 in gastric cancer, Tumor Biol. 34](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref23) [\(2013\) 793](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref23)e[803.](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref23) +- [24] [L. Xie, X. Su, L. Zhang, X. Yin, L. Tang, X. Zhang, Y. Xu, Z. Gao, K. Liu, M. Zhou,](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref24) [B. Gao, D. Shen, L. Zhang, J. Ji, P.R. Gavine, J. Zhang, E. Kilgour, X. Zhang, Q. Ji,](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref24) + +FGFR2 gene amplifi[cation in gastric cancer predicts sensitivity to the selective](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref24) [FGFR inhibitor AZD4547, Clin. Cancer Res. 19 \(2013\) 2572](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref24)e[2583](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref24). + +- [25] [K. Schmidt, C. Moser, C. Hellerbrand, D. Zieker, C. Wagner, J. Redekopf,](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref25) [H.J. Schlitt, E.K. Geissler, S.A. Lang, Targeting](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref25) fibroblast growth factor receptor [\(FGFR\) with BGJ398 in a gastric Cancer model, Anticancer Res. 35 \(2015\)](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref25) [6655](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref25)e[6665](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref25). +- [26] [Z. Zhao, H. Wu, L. Wang, Y. Liu, S. Knapp, Q. Liu, N.S. Gray, Exploration of type](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref26) [II binding mode: a privileged approach for kinase inhibitor focused drug](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref26) [discovery? ACS Chem. Biol. 9 \(2014\) 1230](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref26)e[1241](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref26). +- [27] [R.A. Norman, D. Toader, A.D. Ferguson, Structural approaches to obtain kinase](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref27) [selectivity, Trends Pharmacol. Sci. 33 \(2012\) 273](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref27)e[278.](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref27) +- [28] [S. Eathiraj, R. Palma, M. Hirschi, E. Volckova, E. Nakuci, J. Castro, C.R. Chen,](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref28) [T.C. Chan, D.S. France, M.A. 
Ashwell, A novel mode of protein kinase inhibition](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref28) [exploiting hydrophobic motifs of autoinhibited kinases: discovery of ATP](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref28)independent inhibitors of fi[broblast growth factor receptor, J. Biol. Chem.](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref28) [286 \(2011\) 20677](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref28)e[20687.](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref28) +- [29] [Y. Wang, Y. Cai, J. Ji, Z. Liu, C. Zhao, Y. Zhao, T. Wei, X. Shen, X. Zhang, X. Li,](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref29) G. Liang, Discovery and identifi[cation of new non-ATP competitive FGFR1](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref29) [inhibitors with therapeutic potential on non-small-cell lung cancer, Cancer](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref29) [Lett. 344 \(2014\) 82](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref29)e[89](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref29). +- [30] [Y. Zhang, L. Zhao, J. Wu, X. Jiang, L. Dong, F. Xu, P. Zou, Y. Dai, X. Shan, S. Yang,](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref30) [G. Liang, Synthesis and evaluation of a series of novel asymmetrical curcumin](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref30) analogs for the treatment of infl[ammation, Molecules 19 \(2014\) 7287](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref30)e[7307](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref30). +- [31] [Y.L. Zhang, X. Jiang, K.S. Peng, C.W. Chen, L.L. Fu, Z. Wang, J.P. Feng, Z.G. Liu,](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref31) [H.J. Zhang, G. Liang, Z. Pan, Discovery and evaluation of novel anti](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref31)infl[ammatory derivatives of natural bioactive curcumin, Drug Des. Dev.](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref31) [Ther. 8 \(2014\) 2161](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref31)e[2171.](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref31) +- [32] [L.L. Dong, S.Q. Zheng, Y.L. Zhang, X. Jiang, J.Z. Wu, X.Q. Zhang, X.O. Shan,](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref32) [D.D. Liang, S.L. Ying, J.P. Feng, G. Liang, Design, synthesis, and evaluation of](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref32) [semi-conservative mono-carbonyl analogs of curcumin as anti-in](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref32)flammatory [agents against lipopolysaccharide-induced acute lung injury, Med. Chem.](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref32) [Commun. 6 \(2015\) 1544](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref32)e[1553.](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref32) +- [33] [L.N. Johnson, Protein kinase inhibitors: contributions from structure to clinical](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref33) [compounds, Q. Rev. Biophys. 42 \(2009\) 1](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref33)e[40](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref33). +- [34] [J.L. Qu, X.J. Qu, M.F. Zhao, Y.E. Teng, Y. Zhang, K.Z. Hou, Y.H. Jiang, X.H. Yang,](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref34) [Y.P. Liu, Gastric cancer exosomes promote tumour cell proliferation through](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref34) [PI3K/Akt and MAPK/ERK activation, Dig. Liver Dis. 
41 \(2009\) 875](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref34)e[880.](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref34) +- [35] [M. Fu, C. Wang, Z. Li, T. Sakamaki, R.G. Pestell, Minireview: cyclin D1: normal](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref35) [and abnormal functions, Endocrinology 145 \(2004\) 5439](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref35)e[5447.](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref35) +- [36] [W.K. Wu, C.H. Cho, C.W. Lee, D. Fan, K. Wu, J. Yu, J.J. Sung, Dysregulation of](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref36) [cellular signaling in gastric cancer, Cancer Lett. 295 \(2010\) 144](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref36)e[153](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref36). +- [37] [D.S. Cao, S. Liu, L. Fan, Y.Z. Liang, QSAR analysis of the effects of OATP1B1](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref37) [transporter by structurally diverse natural products using a particle swarm](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref37) [optimization-combined multiple linear regression approach, Chemom. Intell.](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref37) [Lab. Syst. 130 \(2014\) 84](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref37)e[90](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref37). +- [38] [D.S. Cao, Q.S. Xu, Q.N. Hu, Y.Z. Liang, ChemoPy: freely available python](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref38) [package for computational biology and chemoinformatics, Bioinformatics 29](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref38) [\(2013\) 1092](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref38)e[1094](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref38). +- [39] [G.M. Morris, R. Huey, W. Lindstrom, M.F. Sanner, R.K. Belew, D.S. Goodsell,](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref39) [A.J. Olson, AutoDock4 and AutoDockTools4: automated docking with selective](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref39) receptor fl[exibility, J. Comput. Chem. 30 \(2009\) 2785](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref39)e[2791.](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref39) +- [40] [W.L. DeLano, PyMOL molecular viewer: updates and re](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref40)finements, Abstr. Pap. [Am. Chem. Soc. 238 \(2009\).](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref40) +- [41] [M. Frisch, G. Trucks, H.B. Schlegel, G. Scuseria, M. Robb, J. Cheeseman,](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref41) [G. Scalmani, V. Barone, B. Mennucci, G.e. Petersson, Gaussian 09, Gaussian, Inc,](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref41) [Wallingford, CT, 2009](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref41). +- [42] [D.A. Pearlman, D.A. Case, J.W. Caldwell, W.S. Ross, T.E. Cheatham, S. DeBolt,](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref42) [D. Ferguson, G. Seibel, P. Kollman, AMBER, a package of computer programs](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref42) [for applying molecular mechanics, normal mode analysis, molecular dynamics](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref42) [and free energy calculations to simulate the structural and energetic prop](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref42)[erties of molecules, Comput. Phys. Commun. 
91 \(1995\) 1](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref42)e[41.](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref42) +- [43] [J. Wang, R.M. Wolf, J.W. Caldwell, P.A. Kollman, D.A. Case, Development and](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref43) testing of a general amber force fi[eld, J. Comput. Chem. 25 \(2004\) 1157](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref43)e[1174](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref43). +- [44] [J.A. Izaguirre, D.P. Catarello, J.M. Wozniak, R.D. Skeel, Langevin stabilization of](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref44) [molecular dynamics, J. Chem. Phys. 114 \(2001\) 2090](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref44)e[2098.](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref44) +- [45] [C. Sagui, T.A. Darden, Molecular dynamics simulations of biomolecules: long](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref45)[range electrostatic effects, Annu. Rev. Biophys. Biomol. Struct. 28 \(1999\)](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref45) [155](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref45)e[179.](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref45) +- [46] [A.G. Bailey, C.P. Lowe, MILCH SHAKE: an ef](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref46)ficient method for constraint dy[namics applied to alkanes, J. Comput. Chem. 30 \(2009\) 2485](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref46)e[2493.](http://refhub.elsevier.com/S0223-5234(16)30937-0/sref46) + +c1ccccc1 + +c1ccccc1 + +c1ccccc1 \ No newline at end of file diff --git a/run_app.py b/run_app.py new file mode 100644 index 00000000..e8855333 --- /dev/null +++ b/run_app.py @@ -0,0 +1,501 @@ +from __future__ import annotations + +import asyncio +import logging +# import torch.multiprocessing as multiprocessing +# multiprocessing.set_start_method('spawn') +import multiprocessing +import torch +import tempfile +import json +try: + torch.multiprocessing.set_start_method('spawn') +except: + pass +import os +import signal +import sys +import threading +import traceback +from multiprocessing import Value +from typing import Any, Dict, List, NamedTuple + +import aiohttp +import requests +import psutil +import uvicorn +from starlette.applications import Starlette +from starlette.responses import JSONResponse +from starlette.routing import Route + +from marker_main import ExtractionProc +from marker.utils import send_callback +import time +from datetime import datetime +import pytz +import os +from dotenv import load_dotenv + +load_dotenv() + +# 获取北京时区 +beijing_tz = pytz.timezone('Asia/Shanghai') + +# 全局变量和锁 +request_lock = threading.Lock() +stop_current_proc = Value("i", 0) + +# 日志配置 +logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s %(levelname)s: %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) + +# 信号处理函数 +def signal_handler(signum, frame): + logging.info(f"Received signal {signum}. 
Terminating all processes.") + terminate_children(os.getpid()) + sys.exit(0) + +# 终止所有子进程 +def terminate_children(pid): + try: + parent = psutil.Process(pid) + children = parent.children(recursive=True) + for child in children: + logging.info(f"Terminating process {child.pid}") + try: + child.terminate() + child.wait(timeout=5) + except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.TimeoutExpired): + pass + except psutil.NoSuchProcess: + pass + +# 任务队列和结果队列 - 拆分成两个队列 +regular_queue = multiprocessing.Queue() # 常规解析队列 +molecule_queue = multiprocessing.Queue() # 分子识别解析队列 +result_queue = multiprocessing.Queue() + +# 常规解析消费者进程 +def regular_document_proc( + regular_queue: multiprocessing.Queue, + result_queue: multiprocessing.Queue, + stop_current_proc: Value, +): + print("init regular parsing model...", flush=True) + extraction_proc = ExtractionProc() + extraction_proc.load_models() + print("regular parsing model inited", flush=True) + + while True: + print("Regular queue: Waiting for new task...") + params = regular_queue.get() + if params is None: # Shutdown signal + break + print("Starting regular extraction process") + + file = params.get("file", None) + args = params.get("args", {}) + file_type = params.get("file_type", "pdf") + docId = params.get("docId", "") + callback_url = params.get("callback_url", "") + skip_layout = params.get("skip_layout", False) + print('callback_url', callback_url, flush=True) + try: + print('start>>>regular extraction', flush=True) + print('file_type', file_type, type(file), flush=True) + if file_type.lower() == "pdf": + extraction_outputs = extraction_proc.extraction( + args, + file, + callback_url=callback_url, + docId=docId, + file_type=file_type, + mol_detect=False # 强制关闭分子识别 + ) + elif file_type.lower() == "docx": + # 直接转换DOCX到markdown,避免PDF转换的不准确性 + extraction_outputs = extraction_proc.parse_docx_direct(file) + elif file_type.lower() == "pptx": + # 将文件存储为临时文件,然后使用完整的extraction流程 + temp_file_path = os.path.join(tempfile.gettempdir(), f"{docId}.{file_type}") + try: + with open(temp_file_path, "wb") as f: + f.write(file) + + extraction_outputs = extraction_proc.extraction( + args, + temp_file_path, + callback_url=callback_url, + docId=docId, + file_type=file_type, + mol_detect=False + ) + except Exception as e: + print(traceback.format_exc()) + print('regular document process, parse pptx to pdf failed, parse pptx directly...') + extraction_outputs = extraction_proc.parse_pptx(file) + finally: + # 确保临时文件被清理 + if os.path.exists(temp_file_path): + os.remove(temp_file_path) + elif file_type == "jpg" or file_type == "png" or file_type == "jpeg": + # 将file存在临时文件里面,并提供path + temp_file_path = os.path.join(tempfile.gettempdir(), f"{docId}.{file_type}") + with open(temp_file_path, "wb") as f: + f.write(file) + + # 对于图片输入,根据skip_layout参数决定是否跳过layout布局检测,强制OCR检测 + args['force_ocr'] = True + args['force_layout_block'] = "Text" + print(f"[ImageProcessing] skip_layout=True, forcing OCR and setting layout block to Text") + + extraction_outputs = extraction_proc.extraction( + args, + temp_file_path, + callback_url=callback_url, + docId=docId, + file_type=file_type + ) + else: + raise Exception("Unsupported file type") + + result_queue.put({"docId": docId, "result": extraction_outputs}) + if callback_url: + time_str = datetime.now(beijing_tz).strftime("%H:%M:%S") + + # Handle dictionary format + if isinstance(extraction_outputs, dict): + markdown_text = extraction_outputs.get('text', '') + metadata = extraction_outputs.get('metadata', {}) + info = 
extraction_outputs.get('info', {}) + images = extraction_outputs.get('images', {}) + mol_images = extraction_outputs.get('mol_images', {}) + table_contents = extraction_outputs.get('table_contents', {}) + table_count = info.get('table_count', 0) + formula_count = info.get('formula_count', 0) + ocr_count = info.get('ocr_count', 0) + else: + # Backward compatibility with tuple format + markdown_text = extraction_outputs[0] if len(extraction_outputs) > 0 else '' + metadata = extraction_outputs[3] if len(extraction_outputs) > 3 else {} + info = extraction_outputs[2] if len(extraction_outputs) > 2 else {} + images = {} + mol_images = {} + table_contents = {} + table_count = info.get('table_count', 0) if file_type == "pdf" else 0 + formula_count = info.get('formula_count', 0) if file_type == "pdf" else 0 + ocr_count = info.get('ocr_count', 0) if file_type == "pdf" else 0 + + print('metadata', metadata, flush=True) + send_callback(callback_url, { + 'status': True, + 'messages': 'success', + 'markdown': markdown_text, + 'metadata': json.dumps(metadata), + 'images': json.dumps(images), + 'mol_images': json.dumps(mol_images), + 'table_contents': json.dumps(table_contents), + 'docId': docId, + 'progress': 95, + 'progress_text': '开始chunking和embedding\ntable数量 ' + str(table_count) + ' 公式数量 ' + str(formula_count) + ' ocr次数 ' + str(ocr_count) + ' ' + time_str + }) + except Exception as e: + traceback.print_exc() + result_queue.put({ + "docId": docId, + "markdown": ' ', + "metadata": json.dumps({}), + 'images': json.dumps({}), + 'mol_images': json.dumps({}), + 'table_contents': json.dumps({}), + 'status': False, + 'messages': 'success' + }) + if callback_url: + time_str = datetime.now(beijing_tz).strftime("%H:%M:%S") + send_callback(callback_url, { + 'status': False, + 'messages': 'error' + str(e), + 'docId': docId, + 'progress': 95, + 'progress_text': 'error' + str(e) + }) + finally: + stop_current_proc.value = 0 + +# 分子识别消费者进程 +def molecule_document_proc( + molecule_queue: multiprocessing.Queue, + result_queue: multiprocessing.Queue, + stop_current_proc: Value, +): + print("init molecule detection model...", flush=True) + extraction_proc = ExtractionProc() + extraction_proc.load_models() + print("molecule detection model inited", flush=True) + + while True: + print("Molecule queue: Waiting for new task...") + params = molecule_queue.get() + if params is None: # Shutdown signal + break + print("Starting molecule detection extraction process") + + file = params.get("file", None) + args = params.get("args", {}) + file_type = params.get("file_type", "pdf") + docId = params.get("docId", "") + callback_url = params.get("callback_url", "") + skip_layout = params.get("skip_layout", False) + print('callback_url', callback_url, flush=True) + try: + print('start>>>molecule detection extraction', flush=True) + print('file_type', file_type, type(file), flush=True) + if file_type.lower() == "pdf": + extraction_outputs = extraction_proc.extraction( + args, + file, + callback_url=callback_url, + docId=docId, + file_type=file_type, + mol_detect=True # 强制开启分子识别 + ) + elif file_type.lower() == "docx": + # 直接转换DOCX到markdown,避免PDF转换的不准确性 + extraction_outputs = extraction_proc.parse_docx_direct(file) + elif file_type.lower() == "pptx": + # 将文件存储为临时文件,然后使用完整的extraction流程 + temp_file_path = os.path.join(tempfile.gettempdir(), f"{docId}.{file_type}") + try: + with open(temp_file_path, "wb") as f: + f.write(file) + + extraction_outputs = extraction_proc.extraction( + args, + temp_file_path, + callback_url=callback_url, + 
docId=docId, + file_type=file_type, + mol_detect=True + ) + except Exception as e: + print(traceback.format_exc()) + print('molecule_document proess parse pptx to pdf failed, parse pptx directly...') + extraction_outputs = extraction_proc.parse_pptx(file) + finally: + # 确保临时文件被清理 + if os.path.exists(temp_file_path): + os.remove(temp_file_path) + elif file_type == "jpg" or file_type == "png" or file_type == "jpeg": + # 将file存在临时文件里面,并提供path + temp_file_path = os.path.join(tempfile.gettempdir(), f"{docId}.{file_type}") + with open(temp_file_path, "wb") as f: + f.write(file) + + # 对于图片输入,根据skip_layout参数决定是否跳过layout布局检测,强制OCR检测 + args['force_ocr'] = True + args['force_layout_block'] = "Text" + print(f"[ImageProcessing] skip_layout=True, forcing OCR and setting layout block to Text") + + extraction_outputs = extraction_proc.extraction( + args, + temp_file_path, + callback_url=callback_url, + docId=docId, + file_type=file_type + ) + else: + raise Exception("Unsupported file type") + + result_queue.put({"docId": docId, "result": extraction_outputs}) + if callback_url: + time_str = datetime.now(beijing_tz).strftime("%H:%M:%S") + + # Handle dictionary format + if isinstance(extraction_outputs, dict): + markdown_text = extraction_outputs.get('text', '') + metadata = extraction_outputs.get('metadata', {}) + info = extraction_outputs.get('info', {}) + images = extraction_outputs.get('images', {}) + mol_images = extraction_outputs.get('mol_images', {}) + table_contents = extraction_outputs.get('table_contents', {}) + table_count = info.get('table_count', 0) + formula_count = info.get('formula_count', 0) + ocr_count = info.get('ocr_count', 0) + else: + # Backward compatibility with tuple format + markdown_text = extraction_outputs[0] if len(extraction_outputs) > 0 else '' + metadata = extraction_outputs[3] if len(extraction_outputs) > 3 else {} + info = extraction_outputs[2] if len(extraction_outputs) > 2 else {} + images = {} + mol_images = {} + table_contents = {} + table_count = info.get('table_count', 0) if file_type == "pdf" else 0 + formula_count = info.get('formula_count', 0) if file_type == "pdf" else 0 + ocr_count = info.get('ocr_count', 0) if file_type == "pdf" else 0 + + print('metadata', metadata, flush=True) + send_callback(callback_url, { + 'status': True, + 'messages': 'success', + 'markdown': markdown_text, + 'metadata': json.dumps(metadata), + 'images': json.dumps(images), + 'mol_images': json.dumps(mol_images), + 'table_contents': json.dumps(table_contents), + 'docId': docId, + 'progress': 95, + 'progress_text': '开始chunking和embedding\ntable数量 ' + str(table_count) + ' 公式数量 ' + str(formula_count) + ' ocr次数 ' + str(ocr_count) + ' ' + time_str + }) + except Exception as e: + traceback.print_exc() + result_queue.put({ + "docId": docId, + "markdown": ' ', + "metadata": json.dumps({}), + 'images': json.dumps({}), + 'mol_images': json.dumps({}), + 'table_contents': json.dumps({}), + 'status': False, + 'messages': 'success' + }) + if callback_url: + time_str = datetime.now(beijing_tz).strftime("%H:%M:%S") + send_callback(callback_url, { + 'status': False, + 'messages': 'error' + str(e), + 'docId': docId, + 'progress': 95, + 'progress_text': 'error' + str(e) + }) + finally: + stop_current_proc.value = 0 + + +# 响应类 +class ExtractionResponse(NamedTuple): + status_code: int + success: bool + msg: str + +# 生成响应 +def do_response(resp: ExtractionResponse): + return JSONResponse(resp._asdict()) + +# 处理文档提取请求 +async def document_extract(request): + try: + form = await request.form() + docId = 
form.get("docId", "") + file_type = form.get("file_type", "pdf") # pdf, docx, pptx, jpg, png, jpeg + callback_url = form.get("callback_url", "") + mol_detect = form.get("mol_detect", "False") == 'True' + skip_layout = form.get("skip_layout", "False") == 'True' # 新增参数:是否跳过layout检测 + print('mol_detect', form.get("mol_detect", ""), mol_detect, flush=True) + print('skip_layout', form.get("skip_layout", ""), skip_layout, flush=True) + + if not docId: + return do_response( + ExtractionResponse(100, False, "No docId provided in request") + ) + + file = form.get("file", None) + + if file: + file_content = await file.read() # 读取文件内容 + else: + file_content = None + + args = json.loads(form.get("args", "{}")) + args['workers'] = 5 + + extra = json.loads(form.get("extra", "{}")) + is_testing = extra.get("is_testing", False) + + params = { + "file": file_content, + "file_type": file_type, + "args": args, + "docId": docId, + "callback_url": callback_url, + "mol_detect": mol_detect, + "skip_layout": skip_layout, + } + + if mol_detect: + molecule_queue.put(params) + else: + regular_queue.put(params) + + return do_response(ExtractionResponse(200, True, "Task accepted")) + except Exception as e: + traceback.print_exc() + return do_response( + ExtractionResponse(102, False, f"Exception catch, cause {e!r}") + ) + +# Ping响应 +async def ping_resp(request): + return JSONResponse({"response": "pong"}) + +# Starlette应用 +app = Starlette( + debug=True, + routes=[ + Route("/api/v1/document_extract", document_extract, methods=["POST"]), + Route("/api/ping", ping_resp, methods=["GET"]), + ], +) + +# 主函数 +def main(): + # 注册信号处理函数 + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) + + # 启动消费者进程 + processes = [] + + # 启动4个常规解析消费者进程(处理速度快,需要更多并发) + for _ in range(2): + process = multiprocessing.Process( + target=regular_document_proc, + args=( + regular_queue, + result_queue, + stop_current_proc, + ), + ) + process.start() + processes.append(process) + + # 启动2个分子识别消费者进程(处理速度慢,占用资源多,减少并发数) + for _ in range(1): + process = multiprocessing.Process( + target=molecule_document_proc, + args=( + molecule_queue, + result_queue, + stop_current_proc, + ), + ) + process.start() + processes.append(process) + + # 启动Uvicorn服务器 + try: + uvicorn.run(app, host=os.getenv("HOST"), port=int(os.getenv("PORT"))) + finally: + # Shutdown consumers + for _ in range(2): # 关闭常规解析进程 + regular_queue.put(None) + for _ in range(1): # 关闭分子识别进程 + molecule_queue.put(None) + for process in processes: + process.join() + +if __name__ == "__main__": + main() diff --git a/start.sh b/start.sh new file mode 100755 index 00000000..32b4beb1 --- /dev/null +++ b/start.sh @@ -0,0 +1,4 @@ +docker stop gpt_marker +docker rm gpt_marker + +docker run -d -p 8089:8080 -v ./:/app -v /home/luohao/.cache/huggingface:/root/.cache/huggingface -v /home/luohao/.cache/datalab:/root/.cache/datalab --gpus '"device=3"' --name gpt_marker gpt_marker:v3 ./app.sh diff --git a/test_molecule_extraction.py b/test_molecule_extraction.py new file mode 100644 index 00000000..b7cbd9a0 --- /dev/null +++ b/test_molecule_extraction.py @@ -0,0 +1,591 @@ +#!/usr/bin/env python3 +""" +分子识别extraction测试脚本 + +测试marker_main.ExtractionProc的extraction方法,包含分子识别功能 +""" + +import os +import sys +import time +import json +from pathlib import Path +import argparse + +# 添加项目路径 +sys.path.insert(0, '/home/luohao/codes/gpt-marker') + +from marker_main import ExtractionProc +from marker.models import create_model_dict + + +def create_test_args_with_molecule_detection( + 
output_dir: str = "./test_output", + debug: bool = True, + force_ocr: bool = False, + use_molecule_detection: bool = True, + processor_config: dict = None, + use_mock_data: bool = True # 默认使用mock数据 +): + """ + 创建包含分子识别的测试参数 + + Args: + output_dir: 输出目录 + debug: 是否启用调试模式 + force_ocr: 是否强制OCR + use_molecule_detection: 是否启用分子检测 + processor_config: img2mol processor配置 + use_mock_data: 是否使用mock数据 + + Returns: + 配置好的参数字典 + """ + + # 默认分子识别配置 + default_processor_config = { + "device": "cuda", + "with_mol_detect": True, + "with_table_detect": True, + "use_yolo_mol_model": True, + "use_yolo_table_model": True, + "use_yolo_table_model_v2": True, + "debug": debug, + "num_workers": 1, + "padding": 0, + "use_mock_data": use_mock_data, + "mock_mode": use_mock_data + } + + if processor_config: + default_processor_config.update(processor_config) + + # 创建config_json,包含分子检测配置 + config_json = { + "use_molecule_detection": use_molecule_detection, + "use_llm": False, # 可以根据需要启用 + "processor_config": default_processor_config if use_molecule_detection else {} + } + args = { + 'output_dir': output_dir, + 'debug': debug, + 'output_format': 'markdown', + 'page_range': None, + 'force_ocr': force_ocr, + 'processors': None, + 'config_json': config_json, # 现在是JSON字符串 + 'languages': None, + 'disable_multiprocessing': False, + 'paginate_output': False, + 'disable_image_extraction': False, + } + + return args + + +def test_molecule_extraction( + pdf_path: str, + output_dir: str = "./test_output", + callback_url: str = "", + doc_id: str = "test_doc", + processor_config: dict = None +): + """ + 测试分子识别extraction功能 + + Args: + pdf_path: PDF文件路径 + output_dir: 输出目录 + callback_url: 回调URL(可选) + doc_id: 文档ID + processor_config: processor配置 + + Returns: + 提取结果 + """ + + # 检查文件是否存在 + if not os.path.exists(pdf_path): + raise FileNotFoundError(f"PDF文件不存在: {pdf_path}") + + # 创建输出目录 + os.makedirs(output_dir, exist_ok=True) + + print(f"🧪 开始测试分子识别extraction功能") + print(f"📄 PDF文件: {pdf_path}") + print(f"📁 输出目录: {output_dir}") + print(f"🆔 文档ID: {doc_id}") + print("-" * 60) + + # 初始化ExtractionProc + print("⚙️ 初始化ExtractionProc...") + proc = ExtractionProc() + + # 加载模型 + print("🤖 加载模型...") + proc.load_models() + + # 如果启用分子检测,需要在model_dict中添加processor_config + if processor_config: + if not hasattr(proc, 'model_dict'): + proc.model_dict = create_model_dict() + proc.model_dict["processor_config"] = processor_config + + # 读取PDF文件 + print("📖 读取PDF文件...") + with open(pdf_path, 'rb') as f: + file_byte = f.read() + + # 创建测试参数 + print("🔧 配置分子识别参数...") + args = create_test_args_with_molecule_detection( + output_dir=output_dir, + debug=True, + processor_config=processor_config + ) + + print("🔍 分子识别配置:") + config_json = args['config_json'] + print(f" - 启用分子检测: {config_json.get('use_molecule_detection', False)}") + print(f" - 启用LLM: {config_json.get('use_llm', False)}") + if 'processor_config' in config_json: + pc = config_json['processor_config'] + print(f" - 设备: {pc.get('device', 'unknown')}") + print(f" - 分子检测: {pc.get('with_mol_detect', False)}") + print(f" - 表格检测: {pc.get('with_table_detect', False)}") + + print("-" * 60) + + # 开始extraction + print("🚀 开始extraction...") + start_time = time.time() + + try: + result = proc.extraction( + args=args, + file_byte=file_byte, + callback_url=callback_url, + docId=doc_id, + file_type='pdf' + ) + + end_time = time.time() + processing_time = end_time - start_time + + print("✅ Extraction完成!") + print(f"⏱️ 处理时间: {processing_time:.2f}秒") + + # Handle dictionary format + if isinstance(result, dict): + full_text = 
result.get('text', '') + info = result.get('info', {}) + metadata = result.get('metadata', {}) + image_mappings = result.get('image_mappings', {}) + table_contents = result.get('table_contents', {}) + else: + # Backward compatibility with tuple format + if len(result) >= 4: + full_text, _, info, metadata = result[:4] + image_mappings = result[4] if len(result) > 4 else {} + table_contents = result[5] if len(result) > 5 else {} + else: + print(f"❌ Unexpected result format: {len(result)} items") + return {'success': False, 'error': 'Unexpected result format'} + + # 分析结果 + print("\n📊 提取结果分析:") + print(f" - 文本长度: {len(full_text)} 字符") + print(f" - 表格数量: {info.get('table_count', 0)}") + print(f" - 公式数量: {info.get('formula_count', 0)}") + print(f" - OCR页面数: {info.get('ocr_count', 0)}") + print(f" - 图片映射数量: {len(image_mappings)}") + print(f" - 表格内容数量: {len(table_contents)}") + + # 分子识别结果分析 - 检查新的标签格式 + mol_count = full_text.count(' 0 or mol_table_count > 0: + print("\n🧬 分子识别示例输出:") + lines = full_text.split('\n') + for i, line in enumerate(lines): + if ' 200 else html_table) + + # 验证包含C1CCCCC1等分子 + if 'C1CCCCC1' in html_table or 'c1ccccc1' in html_table: + print("✅ 包含目标分子SMILES") + else: + print("❌ 未包含目标分子SMILES") + + print("\n" + "=" * 60) + print("✅ 分子表格Mock数据生成测试完成") + + except Exception as e: + print(f"❌ 测试失败: {e}") + import traceback + traceback.print_exc() + + +def main(): + """主函数 - 运行测试""" + + print("🧪 分子识别Extraction测试脚本") + print("=" * 60) + + # 检查命令行参数 + use_real_model = '--real' in sys.argv # 使用 --real 参数启用真实模型 + use_mock_data = not use_real_model # 默认使用mock模式 + + # 测试配置 + test_config = { + 'pdf_path': 'data/molecule.pdf', # 请修改为实际路径 + 'output_dir': './molecule_test_output', + 'doc_id': 'chemistry_test', + 'processor_config': { + 'device': 'cuda', # 或 'cpu' + 'with_mol_detect': True, + 'with_table_detect': True, + 'use_yolo_mol_model': True, + 'use_yolo_table_model': True, + 'use_yolo_table_model_v2': True, + 'debug': True, + 'num_workers': 1, + 'padding': 0, + # 添加更多img2mol配置... + } + } + + # 显示运行模式 + if use_mock_data: + print("🎭 运行模式: Mock测试模式 (快速测试)") + print(" - 使用 --real 参数启用真实模型测试") + else: + print("🔬 运行模式: 真实模型测试") + + print("-" * 60) + + # 检查PDF文件 + pdf_path = test_config['pdf_path'] + if not os.path.exists(pdf_path): + print(f"❌ 测试PDF文件不存在: {pdf_path}") + print("\n📝 使用说明:") + print("1. 请将test_config['pdf_path']设置为实际的PDF文件路径") + print("2. Mock测试: python test_molecule_extraction.py") + print("3. 
+def main():
+    """Main entry point - run the test."""
+
+    print("🧪 分子识别Extraction测试脚本")
+    print("=" * 60)
+
+    # Check the command-line arguments
+    use_real_model = '--real' in sys.argv  # pass --real to enable the real models
+    use_mock_data = not use_real_model  # mock mode by default
+
+    # Test configuration
+    test_config = {
+        'pdf_path': 'data/molecule.pdf',  # change this to the actual path
+        'output_dir': './molecule_test_output',
+        'doc_id': 'chemistry_test',
+        'processor_config': {
+            'device': 'cuda',  # or 'cpu'
+            'with_mol_detect': True,
+            'with_table_detect': True,
+            'use_yolo_mol_model': True,
+            'use_yolo_table_model': True,
+            'use_yolo_table_model_v2': True,
+            'debug': True,
+            'num_workers': 1,
+            'padding': 0,
+            # add further img2mol settings here...
+        }
+    }
+
+    # Report the run mode
+    if use_mock_data:
+        print("🎭 运行模式: Mock测试模式 (快速测试)")
+        print(" - 使用 --real 参数启用真实模型测试")
+    else:
+        print("🔬 运行模式: 真实模型测试")
+
+    print("-" * 60)
+
+    # Check the test PDF
+    pdf_path = test_config['pdf_path']
+    if not os.path.exists(pdf_path):
+        print(f"❌ 测试PDF文件不存在: {pdf_path}")
+        print("\n📝 使用说明:")
+        print("1. 请将test_config['pdf_path']设置为实际的PDF文件路径")
+        print("2. Mock测试: python test_molecule_extraction.py")
+        print("3. 真实模型测试: python test_molecule_extraction.py --real")
+        return
+
+    # Run the test
+    try:
+        # Call the mock-aware test function and pass the use_mock_data parameter through
+        result = test_molecule_extraction_with_mock(**test_config, use_mock_data=use_mock_data)
+
+        if result['success']:
+            print(f"\n🎉 测试成功完成!")
+            print(f"📄 输出文件: {result['output_file']}")
+            print(f"🧬 识别到 {result['mol_count']} 个分子结构")
+            print(f"📊 识别到 {result['mol_table_count']} 个分子表格")
+            print(f"⏱️ 总处理时间: {result['processing_time']:.2f}秒")
+
+            if result['mol_count'] > 0 or result['mol_table_count'] > 0:
+                print("\n✨ 分子识别功能工作正常!")
+                if use_mock_data:
+                    print("🎭 Mock模式测试通过,可以尝试使用 --real 参数测试真实模型")
+            else:
+                print("\n⚠️ 未检测到分子内容,可能的原因:")
+                print(" - PDF中没有化学分子结构")
+                print(" - 分子检测模型未正确加载")
+                print(" - 图像质量不足")
+        else:
+            print(f"\n❌ 测试失败: {result['error']}")
+
+    except Exception as e:
+        print(f"\n💥 测试脚本执行失败: {str(e)}")
+        import traceback
+        traceback.print_exc()
+
+
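main() detects the --real flag by scanning sys.argv directly. An argparse-based variant, sketched below under the assumption that only this one flag matters, would behave the same for --real and additionally provide a --help message; the extra --pdf option is illustrative only and not part of the script.

```python
import argparse

def parse_cli_args():
    """Minimal argparse sketch; only --real mirrors the script, --pdf is hypothetical."""
    parser = argparse.ArgumentParser(description="Molecule extraction test")
    parser.add_argument("--real", action="store_true",
                        help="use the real img2mol models instead of mock data")
    parser.add_argument("--pdf", default="data/molecule.pdf",
                        help="path to the test PDF (hypothetical option)")
    return parser.parse_args()
```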
+def test_molecule_extraction_with_mock(
+    pdf_path: str,
+    output_dir: str = "./test_output",
+    callback_url: str = "",
+    doc_id: str = "test_doc",
+    processor_config: dict = None,
+    use_mock_data: bool = True
+):
+    """
+    Molecule recognition extraction test with mock support.
+    """
+
+    # Check that the input file exists
+    if not os.path.exists(pdf_path):
+        raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")
+
+    # Create the output directory
+    os.makedirs(output_dir, exist_ok=True)
+
+    print(f"🧪 开始测试分子识别extraction功能")
+    print(f"📄 PDF文件: {pdf_path}")
+    print(f"📁 输出目录: {output_dir}")
+    print(f"🆔 文档ID: {doc_id}")
+    print(f"🎭 Mock模式: {'启用' if use_mock_data else '禁用'}")
+    print("-" * 60)
+
+    # Initialise ExtractionProc
+    print("⚙️ 初始化ExtractionProc...")
+    proc = ExtractionProc()
+
+    # Load the models
+    print("🤖 加载模型...")
+    proc.load_models()
+
+    # If molecule detection is enabled, processor_config must be added to model_dict
+    if processor_config:
+        if not hasattr(proc, 'model_dict'):
+            proc.model_dict = create_model_dict()
+        proc.model_dict["processor_config"] = processor_config
+
+    # Read the PDF file
+    print("📖 读取PDF文件...")
+    with open(pdf_path, 'rb') as f:
+        file_byte = f.read()
+
+    # Build the test arguments
+    print("🔧 配置分子识别参数...")
+    args = create_test_args_with_molecule_detection(
+        output_dir=output_dir,
+        debug=True,
+        processor_config=processor_config,
+        use_mock_data=use_mock_data
+    )
+
+    print("🔍 分子识别配置:")
+    config_json = args['config_json']
+
+    print(f" - 启用分子检测: {config_json.get('use_molecule_detection', False)}")
+    print(f" - 启用LLM: {config_json.get('use_llm', False)}")
+    if 'processor_config' in config_json:
+        pc = config_json['processor_config']
+        print(f" - 设备: {pc.get('device', 'unknown')}")
+        print(f" - 分子检测: {pc.get('with_mol_detect', False)}")
+        print(f" - 表格检测: {pc.get('with_table_detect', False)}")
+        print(f" - Mock模式: {pc.get('use_mock_data', False)}")
+
+    print("-" * 60)
+
+    # Start the extraction
+    print("🚀 开始extraction...")
+    start_time = time.time()
+
+    try:
+        result = proc.extraction(
+            args=args,
+            file_byte=file_byte,
+            callback_url=callback_url,
+            docId=doc_id,
+            file_type='pdf'
+        )
+
+        end_time = time.time()
+        processing_time = end_time - start_time
+
+        print("✅ Extraction完成!")
+        print(f"⏱️ 处理时间: {processing_time:.2f}秒")
+
+        # Handle dictionary format
+        if isinstance(result, dict):
+            full_text = result.get('text', '')
+            info = result.get('info', {})
+            metadata = result.get('metadata', {})
+            image_mappings = result.get('image_mappings', {})
+            table_contents = result.get('table_contents', {})
+        else:
+            # Backward compatibility with tuple format
+            if len(result) >= 4:
+                full_text, _, info, metadata = result[:4]
+                image_mappings = result[4] if len(result) > 4 else {}
+                table_contents = result[5] if len(result) > 5 else {}
+            else:
+                print(f"❌ Unexpected result format: {len(result)} items")
+                return {'success': False, 'error': 'Unexpected result format'}
+
+        # Analyse the results
+        print("\n📊 提取结果分析:")
+        print(f" - 文本长度: {len(full_text)} 字符")
+        print(f" - 表格数量: {info.get('table_count', 0)}")
+        print(f" - 公式数量: {info.get('formula_count', 0)}")
+        print(f" - OCR页面数: {info.get('ocr_count', 0)}")
+        print(f" - 图片映射数量: {len(image_mappings)}")
+        print(f" - 表格内容数量: {len(table_contents)}")
+
+        # Molecule recognition analysis - check the new tag format
+        mol_count = full_text.count(' 0 or mol_table_count > 0:
+            print("\n🧬 分子识别示例输出:")
+            lines = full_text.split('\n')
+            for i, line in enumerate(lines):
+                if '