Skip to content

Commit 76f273e

Browse files
committed
Add new url source type
Addresses #3
1 parent 30fa82f commit 76f273e

File tree

7 files changed

+251
-1
lines changed

7 files changed

+251
-1
lines changed

bun.lockb

1.05 KB
Binary file not shown.

package.json

+3-1
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,8 @@
4040
"react-hotkeys-hook": "^4.5.1",
4141
"react-markdown": "^9.0.1",
4242
"remark-breaks": "^4.0.0",
43-
"remark-gfm": "^4.0.0"
43+
"remark-gfm": "^4.0.0",
44+
"turndown": "^7.2.0"
4445
},
4546
"devDependencies": {
4647
"@happy-dom/global-registrator": "^15.7.4",
@@ -54,6 +55,7 @@
5455
"@types/papaparse": "^5.3.14",
5556
"@types/react": "^18.3.10",
5657
"@types/react-dom": "^18.3.0",
58+
"@types/turndown": "^5.0.5",
5759
"@typescript-eslint/eslint-plugin": "^8.7.0",
5860
"@typescript-eslint/parser": "^8.7.0",
5961
"@webgpu/types": "^0.1.46",

src/source/Source.ts

+1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ export class SourceTypes {
55
static TEXT = 'text' as const;
66
static DOCS = 'docs' as const;
77
static FILE = 'file' as const;
8+
static URL = 'url' as const;
89
}
910

1011
type SourceTypeList = (typeof SourceTypes)[keyof typeof SourceTypes];

src/source/sources.ts

+2
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,11 @@ import { DocsListSource } from './docsList/docsList';
22
import { FileSource } from './file/fileSource';
33
import { SourceTypes } from './Source';
44
import { TextSource } from './text/textSource';
5+
import { UrlSource } from './url/urlSource';
56

67
/**
 * Registry keyed by the `SourceTypes` constants; each value is the class
 * registered to handle steps of that source type.
 */
export const sourceProviders = {
78
[SourceTypes.DOCS]: DocsListSource,
89
[SourceTypes.TEXT]: TextSource,
910
[SourceTypes.FILE]: FileSource,
11+
// Provider for the `url` source type.
[SourceTypes.URL]: UrlSource,
1012
};

src/source/url/URLSourceRender.tsx

+192
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
import type { Doc } from '@/src/doc/Document';
2+
import type { SourceStep } from '@/src/step/Step';
3+
import { Field } from '@headlessui/react';
4+
import { MagnifyingGlassIcon } from '@heroicons/react/24/solid';
5+
import {
6+
type ChangeEvent,
7+
useCallback,
8+
useEffect,
9+
useMemo,
10+
useRef,
11+
useState,
12+
} from 'react';
13+
import TurndownService from 'turndown';
14+
import { Button } from '~/components/catalyst/button';
15+
import { Dialog, DialogBody, DialogTitle } from '~/components/catalyst/dialog';
16+
import { Label } from '~/components/catalyst/fieldset';
17+
import { Input } from '~/components/catalyst/input';
18+
import { CustomMarkdown } from '~/components/markdown/Markdown';
19+
import { Spinner } from '~/components/Spinner';
20+
import type { URLSourceConfig } from './types';
21+
22+
// Shared Turndown converter. The `script` and `link` filters are registered
// exactly once here: every `remove()` call adds another entry to the
// service's internal filter list, so registering them per request would make
// that list grow with each fetched page.
const turndownService = new TurndownService().remove('script').remove('link');

/**
 * Fetches the HTML content of a page and converts it to markdown.
 *
 * @param url - The URL of the page to fetch.
 * @returns The markdown representation of the page's content.
 * @throws When the request fails, the response status is not OK, or the
 *   conversion fails. The error is logged and then rethrown to the caller.
 */
async function fetchPageAsMarkdown(url: string): Promise<string> {
  try {
    // Fetch the page's HTML content
    const response = await fetch(url);
    if (!response.ok) {
      // statusText is frequently empty (e.g. over HTTP/2), so always include
      // the numeric status to keep the message meaningful.
      throw new Error(
        `Failed to fetch page: ${response.status} ${response.statusText}`.trim()
      );
    }
    const htmlText = await response.text();

    // Convert HTML to markdown using Turndown
    return turndownService.turndown(htmlText);
  } catch (error) {
    console.error('Error fetching or converting page:', error);
    throw error;
  }
}
51+
52+
/** Delay in milliseconds between the last URL edit and the fetch. */
const FETCH_DEBOUNCE_MS = 1000;

/** Builds the empty placeholder document used until a page has been fetched. */
function createPlaceholderDoc(): Doc {
  return {
    id: 'urldoc',
    name: 'Default document',
    content: '',
    test: true,
    processingResults: [],
  };
}

/**
 * Editor for a URL source step.
 *
 * Renders a URL input and an optional CORS proxy prefix. One second after the
 * URL was last edited, the page at `<proxy><url>` is fetched, converted to
 * markdown and stored as the step's single document; the URL field is then
 * cleared and the document is listed with a preview button.
 *
 * @param source - The source step whose `config` is edited.
 * @param setSource - Callback receiving a cloned step with the updated config.
 */
export function URLSourceRender({
  source,
  setSource,
}: {
  source: SourceStep;
  setSource: (newSource: SourceStep) => void;
}) {
  // Handle of the pending debounced fetch, if any.
  const fetchRef = useRef<ReturnType<typeof setTimeout>>();
  const corsProxyRef = useRef<HTMLInputElement>(null);
  const [loading, setLoading] = useState(false);
  const [isPreviewOpen, setPreviewOpen] = useState(false);
  const config = useMemo(() => source.config as URLSourceConfig, [source]);

  const updateConfig = useCallback(
    (config: URLSourceConfig) => {
      const newSource = structuredClone(source);
      newSource.config = config;
      setSource(newSource);
    },
    [source, setSource]
  );

  // Cancel a still-pending debounced fetch when the component unmounts so it
  // does not start a request for an editor that is no longer on screen.
  useEffect(() => {
    return () => {
      if (fetchRef.current) {
        clearTimeout(fetchRef.current);
      }
    };
  }, []);

  const updateDocFromUrl = (e: ChangeEvent<HTMLInputElement>) => {
    const newUrl = e.target.value;

    // update url
    const typedConfig = structuredClone(config);
    typedConfig.url = newUrl;
    updateConfig(typedConfig);

    // reset timeout
    if (fetchRef.current) {
      clearTimeout(fetchRef.current);
    }

    // start new timeout to fetch content
    fetchRef.current = setTimeout(async () => {
      if (!newUrl.length) {
        return;
      }

      setLoading(true);
      try {
        // fetch content from url
        const corsProxy = corsProxyRef.current?.value ?? '';
        const content = await fetchPageAsMarkdown(`${corsProxy}${newUrl}`);

        // `typedConfig` was already handed to `setSource` above, so it must
        // not be mutated; build the fetched state on a fresh copy instead.
        const fetchedConfig = structuredClone(typedConfig);
        const doc = fetchedConfig.document ?? createPlaceholderDoc();
        doc.name = newUrl;
        doc.content = content;
        fetchedConfig.document = doc;
        fetchedConfig.url = '';
        updateConfig(fetchedConfig);
      } catch (err) {
        console.error('error fetching doc:', err);
      } finally {
        setLoading(false);
      }
    }, FETCH_DEBOUNCE_MS);
  };

  // Seed the config with a placeholder document the first time it has none.
  useEffect(() => {
    if (config.document) {
      return;
    }

    const seededConfig = structuredClone(config);
    seededConfig.document = createPlaceholderDoc();
    updateConfig(seededConfig);
  }, [updateConfig, config]);

  return (
    <div className="flex flex-col w-full h-full overflow-auto">
      <div className="flex flex-col gap-1.5">
        <Field className="grid grid-cols-[40px_minmax(0,_1fr)] items-center justify-center gap-6">
          <Label>URL:</Label>
          <Input
            name="url"
            placeholder="Document URL"
            value={config.url ?? ''}
            onChange={updateDocFromUrl}
            disabled={loading}
          />
        </Field>
        <Field className="grid grid-cols-[40px_minmax(0,_1fr)] items-center justify-center gap-6">
          <Label>CORS:</Label>
          <Input
            name="corsproxy"
            placeholder="CORS proxy URL (optional)"
            defaultValue="https://corsproxy.io/?"
            ref={corsProxyRef}
            disabled={loading}
          />
        </Field>
        {!loading && Boolean(config.document?.content?.length) && (
          <div className="p-2 flex items-center justify-between bg-zinc-100 dark:bg-zinc-900 rounded-lg shadow-sm">
            {config.document?.name}
            <div className="flex items-center gap-2 min-w-fit">
              <Button icon title="Preview" onClick={() => setPreviewOpen(true)}>
                <MagnifyingGlassIcon />
              </Button>
            </div>
          </div>
        )}
        {loading && (
          <div className="flex items-center justify-center p-2 gap-2">
            <Spinner className="h-5 w-5" /> Fetching content..
          </div>
        )}
      </div>

      <Dialog
        open={isPreviewOpen}
        onClose={() => setPreviewOpen(false)}
        size="3xl"
        topClassName="z-20"
      >
        <DialogTitle>Preview: {config.document?.name}</DialogTitle>
        <DialogBody className="prose dark:prose-invert max-w-full">
          <CustomMarkdown>{config.document?.content}</CustomMarkdown>
        </DialogBody>
      </Dialog>
    </div>
  );
}

src/source/url/types.ts

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
import { Doc } from '@/src/doc/Document';
2+
3+
/** Configuration object stored on a URL source step. */
export type URLSourceConfig = {
4+
// Text currently shown in the URL input; the editor resets it to '' once a
// page has been fetched successfully.
url?: string;
5+
// Document holding the fetched page: the editor sets its name to the fetched
// URL and its content to the converted markdown.
document?: Doc;
6+
};

src/source/url/urlSource.ts

+47
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import { Doc } from '@/src/doc/Document';
2+
import { SourceStep } from '@/src/step/Step';
3+
import React from 'react';
4+
import { SourceProvider, SourceTypes } from '../Source';
5+
import { URLSourceRender } from './URLSourceRender';
6+
import type { URLSourceConfig } from './types';
7+
8+
/**
 * Source provider for steps of type `SourceTypes.URL`, whose config carries at
 * most one document (the page fetched by the URL editor).
 */
export class UrlSource implements SourceProvider {
  source: SourceStep;

  constructor(source: SourceStep) {
    this.source = source;
  }

  /**
   * Returns the document stored in the step's config.
   *
   * @returns A one-element array with the stored document, or an empty array
   *   when no document has been stored yet (instead of `[undefined]`).
   * @throws Error when the wrapped step is not a URL source.
   */
  async getDocs(): Promise<Doc[]> {
    const config = this.requireUrlConfig(
      'Not URL source passed when trying to get URL source doc!'
    );
    return config.document ? [config.document] : [];
  }

  /**
   * Stores the first of the given documents in the step's config.
   *
   * @param docs - Documents to store; only `docs[0]` is kept.
   * @returns The wrapped source step, whose config was updated in place.
   * @throws Error when the wrapped step is not a URL source.
   */
  async setDocs(docs: Doc[]) {
    const config = this.requireUrlConfig(
      'Not URL doc source passed when trying to set URL source doc!'
    );
    config.document = docs[0];
    return this.source;
  }

  /** Creates the React element for the URL source editor. */
  render({
    source,
    setSource,
  }: {
    source: SourceStep;
    setSource: (newSource: SourceStep) => void;
  }) {
    return React.createElement(URLSourceRender, { source, setSource });
  }

  /** Returns the step's config after checking the step is a URL source. */
  private requireUrlConfig(errorMessage: string): URLSourceConfig {
    if (this.source.sourceType !== SourceTypes.URL) {
      throw new Error(errorMessage);
    }
    return this.source.config as URLSourceConfig;
  }
}

0 commit comments

Comments
 (0)