11import { convert } from "libreoffice-convert" ;
22import { fromPath } from "pdf2pic" ;
3- import { LLMParams } from "./types" ;
3+ import {
4+ ConvertedNodeType ,
5+ LLMParams ,
6+ MdNodeType ,
7+ ParentId ,
8+ ProcessedNode ,
9+ } from "./types" ;
410import { pipeline } from "stream/promises" ;
511import { promisify } from "util" ;
612import * as Tesseract from "tesseract.js" ;
@@ -313,18 +319,13 @@ export const convertKeysToSnakeCase = (
313319 ) ;
314320} ;
315321
316- interface ProcessedNode {
317- id : string ;
318- parentId : string | undefined ;
319- type : string ;
320- value : any ;
321- }
322- interface parentId {
323- id : string ;
324- depth : number ;
325- }
326-
327- export const markdownToJson = async ( markdownString : string ) => {
322+ /**
323+ *
324+ * @param markdownString String - Markdown text
325+ * @param page Number - Page number
326+ * @returns ProcessedNode[] - Array of processed nodes
327+ */
328+ export const markdownToJson = async ( markdownString : string , page : number ) => {
328329 /**
329330 * Bypassing typescript transpiler using eval to use dynamic imports
330331 *
@@ -341,83 +342,99 @@ export const markdownToJson = async (markdownString: string) => {
341342
342343 console . log ( JSON . stringify ( parsedMd ) ) ;
343344
344- const parentIdManager : parentId [ ] = [ ] ;
345+ const parentIdManager : ParentId [ ] = [ ] ;
345346
346- const jsonObj : ProcessedNode [ ] = [ ] ;
347- parsedMd . children . forEach ( ( node : any ) => {
348- const isHeading = node . type === " heading" ;
347+ const processedNodes : ProcessedNode [ ] = [ ] ;
348+ parsedMd . children . forEach ( ( sourceNode : any ) => {
349+ const isHeading = sourceNode . type === MdNodeType . heading ;
349350
350- if ( isHeading && node . depth <= ( parentIdManager . at ( - 1 ) ?. depth || 0 ) ) {
351+ if ( isHeading && sourceNode . depth <= ( parentIdManager . at ( - 1 ) ?. depth || 0 ) ) {
351352 for ( let i = parentIdManager . length ; i > 0 ; i -- ) {
352353 parentIdManager . pop ( ) ;
353- if ( node . depth > ( parentIdManager . at ( - 1 ) ?. depth || 0 ) ) {
354+ if ( sourceNode . depth > ( parentIdManager . at ( - 1 ) ?. depth || 0 ) ) {
354355 break ;
355356 }
356357 }
357358 }
358- const processedNode = processNode ( node , parentIdManager . at ( - 1 ) ?. id ) ;
359+ const processedNode = processNode (
360+ sourceNode ,
361+ page ,
362+ parentIdManager . at ( - 1 ) ?. id
363+ ) ;
359364
360365 if ( isHeading ) {
361- parentIdManager . push ( { id : processedNode [ 0 ] . id , depth : node . depth } ) ;
366+ parentIdManager . push ( {
367+ id : processedNode [ 0 ] . id ,
368+ depth : sourceNode . depth ,
369+ } ) ;
362370 }
363371
364- jsonObj . push ( ...processedNode ) ;
372+ processedNodes . push ( ...processedNode ) ;
365373 } ) ;
366374
367- return jsonObj ;
368- } ;
369-
370- const type : Record < string , string > = {
371- heading : "heading" ,
372- text : "text" ,
373- list : "list" ,
375+ return processedNodes ;
374376} ;
375377
/**
 * Converts one markdown AST node into a flat list of `ProcessedNode`s.
 *
 * - Heading, paragraph and strong nodes: every child is flattened with
 *   `processText` and the pieces are joined with a single space.
 * - List nodes: every child is passed to `processListItem`; the item nodes
 *   become the `value` array and the nodes produced by nested lists are
 *   returned after the main node.
 * - Any other node that carries a string `value` of its own (for example a
 *   fenced code block): that string is used as the value instead of leaving
 *   it `undefined`.
 *
 * @param node - Markdown AST node to convert
 * @param page - Page number stamped on the node created for `node`
 * @param parentId - Id of the parent node, if any
 * @returns The node created for `node`, followed by any nested-list nodes
 */
const processNode = (
  node: any,
  page: number,
  parentId?: string
): ProcessedNode[] => {
  let value: any;
  const siblingNodes: ProcessedNode[] = [];

  if (
    node.type === MdNodeType.heading ||
    node.type === MdNodeType.paragraph ||
    node.type === MdNodeType.strong
  ) {
    value = node.children
      .map((childNode: any) => processText(childNode))
      .join(" ");
  } else if (node.type === MdNodeType.list) {
    // NOTE(review): processListItem is expected to return
    // `{ node, siblings }` — its return statement is outside this view.
    const processedItems = node.children.map((childNode: any) =>
      processListItem(childNode, page)
    );
    value = [];
    processedItems.forEach((item: any) => {
      value.push(...item.node);

      // Store nested list nodes
      siblingNodes.push(...item.siblings);
    });
  } else if (typeof node.value === "string") {
    // Literal block nodes keep their content in `value`; without this branch
    // that content would be dropped and the node emitted with no value.
    value = node.value;
  }

  return [
    {
      id: nanoid(),
      page,
      parentId,
      // Node types without a ConvertedNodeType counterpart are reported as text.
      type:
        ConvertedNodeType[node.type as ConvertedNodeType] ||
        ConvertedNodeType.text,
      value,
    },
    ...siblingNodes,
  ];
};
409420
/** Node types that contribute no text when a node is flattened. */
const ignoreNodeTypes = new Set([MdNodeType.break, MdNodeType.thematicBreak]);

/**
 * Flattens a markdown AST node into plain text.
 *
 * Nodes listed in `ignoreNodeTypes` yield an empty string. Nodes that carry a
 * string `value` (text, inline code, raw html, ...) yield that value. Nodes
 * with `children` are flattened recursively and the pieces joined with a
 * single space. Nodes with neither (for example images) yield an empty string
 * instead of throwing.
 *
 * @param node - Markdown AST node to flatten
 * @returns The text content of `node`
 */
const processText = (node: any): string => {
  if (ignoreNodeTypes.has(node.type)) return "";

  // Literal nodes have no `children`; reading `.children.map` on them throws.
  if (typeof node.value === "string") return node.value;

  if (!Array.isArray(node.children)) return "";

  return node.children.map((child: any) => processText(child)).join(" ");
};
413430
414- const processListItem = ( node : any ) => {
431+ const processListItem = ( node : any , page : number ) => {
415432 let newNode : ProcessedNode [ ] = [ ] ;
416433 let siblings : ProcessedNode [ ] = [ ] ;
417434
418435 node . children . forEach ( ( childNode : any ) => {
419- if ( childNode . type !== " list" ) {
420- const processedNode = processNode ( childNode ) ;
436+ if ( childNode . type !== MdNodeType . list ) {
437+ const processedNode = processNode ( childNode , page ) ;
421438 if ( newNode . length > 0 ) {
422439 newNode [ 0 ] . value += processedNode . map ( ( { value } ) => value ) . join ( ", " ) ;
423440 } else {
@@ -429,13 +446,13 @@ const processListItem = (node: any) => {
429446 newNode = [
430447 {
431448 id : nanoid ( ) ,
432- type : " text" ,
449+ type : ConvertedNodeType . text ,
433450 value : "" ,
434451 parentId : undefined ,
435452 } ,
436453 ] ;
437454 }
438- const processedNode = processNode ( childNode , newNode [ 0 ] . id ) ;
455+ const processedNode = processNode ( childNode , page , newNode [ 0 ] . id ) ;
439456 siblings . push ( ...processedNode ) ;
440457 }
441458 } ) ;
0 commit comments