From 9b78229ea8eaa9b5d48dd9f7a77918b1b1bbfae1 Mon Sep 17 00:00:00 2001 From: Benedict Lee Date: Fri, 10 Oct 2025 11:41:10 +0900 Subject: [PATCH] fix: integration tests to run on simpler sample targets - Inferring the entire ground truth of an overly complex document does not serve the purpose of the test - Prevent test errors from occurring every time inference logic is modified --- .../opendataloader/pdf/IntegrationTest.java | 14 +++------ samples/json/lorem.json | 29 ++++++++++++++++++ samples/pdf/lorem.pdf | Bin 0 -> 10538 bytes 3 files changed, 34 insertions(+), 9 deletions(-) create mode 100644 samples/json/lorem.json create mode 100644 samples/pdf/lorem.pdf diff --git a/java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/IntegrationTest.java b/java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/IntegrationTest.java index e55fc24f..e0ddb6d8 100644 --- a/java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/IntegrationTest.java +++ b/java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/IntegrationTest.java @@ -28,26 +28,22 @@ public class IntegrationTest { static Stream integrationTestParams() { return Stream.of( - Arguments.of("1901.03003.pdf") - ); + Arguments.of("lorem.pdf")); } @ParameterizedTest(name = "{index}: ({0}) => {0}") @MethodSource("integrationTestParams") public void test(String fileName) throws IOException { - Path pdfPath = Paths.get("../../resources", fileName); - Path jsonPath = Paths.get("../../resources", fileName.replace(".pdf", ".json")); + Path pdfPath = Paths.get("../../samples/pdf", fileName); + Path jsonPath = Paths.get("../../samples/json", fileName.replace(".pdf", ".json")); File pdfFile = pdfPath.toFile(); File jsonFile = jsonPath.toFile(); Config config = new Config(); - config.setGenerateMarkdown(true); - config.setGenerateHtml(true); - config.setGeneratePDF(true); - config.setOutputFolder("../../resources/temp"); + config.setOutputFolder("../../samples/temp"); DocumentProcessor.processFile(pdfFile.getAbsolutePath(), config); - Path resultPath = Paths.get("../../resources/temp", fileName.replace(".pdf", ".json")); + Path resultPath = Paths.get("../../samples/temp", fileName.replace(".pdf", ".json")); File resultJson = resultPath.toFile(); ObjectMapper mapper = new ObjectMapper(); diff --git a/samples/json/lorem.json b/samples/json/lorem.json new file mode 100644 index 00000000..6f830274 --- /dev/null +++ b/samples/json/lorem.json @@ -0,0 +1,29 @@ +{ + "file name" : "lorem.pdf", + "number of pages" : 1, + "author" : "leebd-public", + "title" : null, + "creation date" : "D:20251010112501+09'00'", + "modification date" : "D:20251010112501+09'00'", + "kids" : [ { + "type" : "heading", + "id" : 1, + "level" : "Doctitle", + "page number" : 1, + "bounding box" : [ 200.891, 706.938, 394.152, 745.132 ], + "heading level" : 1, + "font" : "Pretendard-Regular", + "font size" : 32.005, + "text color" : "[0.0]", + "content" : "Lorem Ipsum" + }, { + "type" : "paragraph", + "id" : 2, + "page number" : 1, + "bounding box" : [ 85.034, 567.936, 502.306, 659.761 ], + "font" : "Pretendard-Regular", + "font size" : 9.949, + "text color" : "[0.0]", + "content" : "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum." + } ] +} \ No newline at end of file diff --git a/samples/pdf/lorem.pdf b/samples/pdf/lorem.pdf new file mode 100644 index 0000000000000000000000000000000000000000..eb189eca2451cc5bf6d6055b863096c4d10b8236 GIT binary patch literal 10538 zcmeHtXIPWV);7|MNLQK=s;EFn=)FaH@4bakLlQ%f-m5fG5CLh@1(7aI5CjCIS7}mI z1S!&`%NulWx96Pwooip`_m?X_=2>&E^-Nha&$`zPmkvaU55zA*&UJ?Uk%|BW07yF* zaw#dGk~16wLj#rIwip-$21VM#fEqA_BgP2;24hirKrMivjEp-54YPG4^-9l3_B7C+ zyc{{uB-*6&fHFF1(A7x~SB1PuwbW1R@1m!UAVZ3g9pc%M^#lRTs~03O~v(_h=r%p99(H?xCA&nzA+*j+lq6Rl~ixa z4#Lq%iCx-0n6zNH(k&)!oX%CN-JSA6NV8u+t)JaA1Z2zapOy#?&x0BM&+hv6a6ae* z9&iZXM{k^Ym2T_JVW3Ix08I}?M}MywKQ(7z6a%+~enea%P0Yf;faX$6N3Xyo(KwC| zpovrlQ|1LUj>Eu-DXq5~LnXW9gqXOPfE%A%GtX$C<;mj*`*)J9!UDfCKv)mjz8MFk zlH-&Yz-lIsBHf(na4c6Z=ro>2_Spb&qAQy{F8SYPY4EesYyZIOc(Y_?#zOrd0FC4t^Ddewf>ebFxr3~<1B$lqd6V_Ycc%`h29r&2^Vt~X0$6Z(ZcqV8 zeC=-zZupbEQN}y$iy2By$v7Se`R;FAr}4e(GG|~sR2c%Uqt@+TOFJ>c6$0SglEYh( zV_L?eX~BQdLRE)DF^`8U&l81z+XgfrLYr^_0=PmORFh4g6jEw)t%%Sgn@T8D1VxVt zhRzF#5%CB2*x=7x&=a_n8FYPSevcuM14f(L&PCKQqXEvP6a2Zve6bJjt{%i z+Pw)DW91L2ZFz{o^P~J4rrq{vp2~z&Hn@-~h>}VoCq{tk@@pl9`(B(RRBR;))Oonh za#IN`d93x(D)$S!%xMX4-2&-oB$2&kr3?R%ll;W!iAZj-E|E?~k<6&!D31&29+jq& zYfj%#|1u7SSAroUhF{4z`|;q_<*H?>U9lrN!q6wsSWrBDpawHHT@`K>Llx6Vym&an zn^@a|3X^9NbCsV}83wJ(8WyniUi2Mp?kn6-tJR@BXQ!+A5*HIy-m zrjFqy^(^H^EJer8X3rMvK*^6HfLJy@i;agSmgISuioy+Bz9$a`mBiHR)V<_6srRC^ z+EF<{brB}2QV=O^X0B!~T@?pTk?X;nY;J7p*D~1Ovo`?}yOr!I&{C!vvkD0AEVj98 zuXM9-nB2hU*`zhPIm&~%VWI0)5M?Z6uxnuPETzoki5KLy%GOhAt8SIMrR}FyHm`7x ziHzMWE*tw?q-}}!`-|*%K?N9Lk5dn>hP{T+20@=UKDs{PKI=aFyCg@vr<|u(kD8A3 z2_*>G2nh&F32Vu>B6!+rN^_ZSq?>9RKJB&+l^(*ygi&e{f#Kl@jZ4*Mz)6z5EH}n{f z!k;2jIk7XW@Tp_*lKg<9)wmUV*`=~~aK~?I->xIurw1C_eFq~y&@Ae&yU!U9h)&jy zdA=Q(MTXB^53CMMI1@Qzx?px;Cg^j}Q@PHThr#~xdD#T=WAYRseId76rP)v_HeKHm z8h6N-$d)|ALk5Ee*Praaj%`a0_h*eKp&%QsT2+8MvGiXMrgY4_-ae$4&&_(S29Bi@;MlvJzC6S0LO z_K&x`WB7T;P12hW4yZf0(z%qWhwg&z^4;AKf09nvzdIb1U!z`F2mi|dl_rx2NrZf` z@fNtG%~{X=@S3#R9nINHsnidddiS>tS}q%qWfmS4j^{UPk!en6eY-Br-6W0o=zhv= z0~M+Ql4Ed<;0`#ls<$e-Dz7K~(Yl2Q7b&$cOmQ-%vav&Ga$q#59a3opPWzawpSZ&< zDzUGUiOv*2+SE^OYxZ2*_TPDB6>QdOIb@z~UjD)clfU%#$Z)VYzr@FQwT$0Py7>*V zFSsVG#@RKHr<$J^=vUA0V5xoIXeceGVlQCTJccg%R)?ra9lU5dYKmtm{TA4F)KIUQ`?UH?TfjoVm*6No|GxOX zKE5b!*R@SK!F|H}Dcw9hTk-qoJJ)HgdOE^qM4jeK&_DOY3u1y39k5 zl`mgDjX;~!eKykXO1qZ=Um`c*gOfW;eghd__Sa;+XZKDj$oj9uTiIK#HGD!nb#DpC z{z!DpDnWRaP%@G^vH>V*`IRA4)bB&|@xu}0rn1b&%c%9?u9N!{gGKZU@kJI*<0)$R0ELQ;1GAq&&n>MYXlF*roYqGGj-GGs=)WQzgn`aisy`LsMUwNH?1?VJ&zgz5{jFdgL zE_yX_HvAQ~ftlDnHD)MtuzK`TH_&~aZ`JLrF~yTcuzsUrR4ntiMp81uhhML$B$p2P8lJ9{oWf9%)XSUbC}Az-GcD!k zr*}rq&MpR?pbn4+;5r)JQW>o>*21a>D+{YjtX5NT0j{!v6tY>7tbzG>g??hP6d3f# zt5EZ+%uB9G==6J*7t1=A9xV)HO$D=lX&+I4S0%)9_-ScGszE<s%^vRq&dq08aQXcsJ-kDo1Mcy|`*S3z(2xXm#fP7X%Am*g>DsdQD)7AT`?YW2 zS04C+9E=^KDu$A82S#J&Mc#v zbgvVaOyO%VlduKA#pjsJfTDP+%}$K2un7*D9cVCL6K3R zf~rtNJe6XK`nA_co6jX{vzyx6G+nT8wJLR`-Hc3-?`cQOOQ$=A<@fdv26Eej!*s1^ zgj}sy3jkmux(4n`Fd4EBFCvP17+i|BDXypKSoc1cdTy(zSSFTy&6rlo%#t9Jz7+R# zzPm`-M{D0JZ&yCwWGV@aRv}!Q60T4Tlas%9V_DsMT6KKn+D>XNw_IJ&uqE%X7;RC^ zaIF#OVA3U7gO<`)Vp=pNv`JAc)8NE|?}XQpY8m9IU(X@mjx8XTtdDCY0=B0YiN4B9X@3kz@q6fY1gPiIiXxr zMmKF(IYu+ePB+a{fiBS|xTzdy3~VCrXrU63^!l-`rA=-8T1io*Sj)7BurTRxpWzc8 z6Jyh_Ms2$LA2_QhZdB1g8DJpEc51q=HG;iCn@M(IkDSRE=e)klmu}sNN}%8!`}X9P z0W&*(grU$dD6HJ!o<@MYJE5e&L~H(uEf*=X1bt4MqmoevsEsmy`2{Kpw*yX);fF6a zKZtrbi4lJKAp}MIh*?sQGa!T}@C3RsW(gM2^Z(rH6>=|s^+FI z5zH>&%Sj(}@LqbWlly$k(UHXv-d&pH#SS&B@Y@WdPpCLxV(f|l6b8le3rb$~(>S&l-wqtS*ct0@S~~$8ZKQR6CHVM@_zsiXxkL6&_SfEi>!`y!zCNv`uQ_ z`-0yErK)qO@fFTI`gKucP>+S>`%3T@7amN}wQFsrs z7RG-IG~K*I`6xW$c}(7F&XIK_O3{xqI*Q_1W1T1767$W&%A-XSYg@A_#?_oRMBk4x zH7{)3WSdQ!TStewz``g4uXkS}MK=TJOCH9&FkuY2CCqNZTK9-t*zP(o){pT!5x?j? z+P~Rr&v()DO&0(X1d0CIfqx_Xf3o}YO<519?_r1WM!|q4ni@cZ^MWP}V{31VvHi6X zOUZhnY@x0&48RWN=!}qN-)MZr4sf=YW;Yhr6wpK|!knB{e9$m`A1wo@k1JH%o?TW( z%1gq_4dsTl1@Ll%Bitptq}gq;9|M>KwtOxIvIBlVFs{<<^5+!*6HOg}A`%S)2=NQ> zK?Ok~03jiMu&|&w7{m+E_do!&p%{Pw2mlrk5CMQhB|t(Fg5vBlQfPYz2|b9?AMe3F zNwYg)FenKi(9_eC-&2quiFO2n#Kpyd0$?B*%!ftrxqBlpwqAS)caA>{Kw$1rv@;6h zj6?v=4cgivJuuSj*hc?4DTP`VbsrnNZ68tI zAme0jwqx{G3j)aF<|r5?M7j}xAdq%GXvyOzla%j;)^FVCc)ltYU$y>K!}#6Sy^OC$ z>G6%w46VqT_>NBwuY74JOA_XqmeOZ-qyv`r{Y8m6!mJcZDDT4e`oFDP*4oCS=r7lb ztrV|Mm8>y~UyvZ;#tf3l+2s}g94WNr(9G;YF`oXXBMn9@rk|d6K&<9 zt7zVg(5>3deOlacv_WM?8-v)V`ZNiuf8W)t`d9g7^V!y~If&2jvb#n^bi7YW*%q;GBhX)?RpyL*J!v3B>`4#^DNbXc3wx z>OzpuySF?d-a=pSxA9`IAvVE14q=v)orkv65|iu&AzFl66vf^>x=>cNM79+R_6b&; z4;#56*B&!OofJ0sK*%)h69N6=*=eL9Fr>wMp~CjT1V~kko8+2rczYA&SH7=J)$)xv#-19Y>ot;-6^aZScxzI!E0mt*(fKvw zSih`A&Kb8IBlaR#b;duA(MdZecCRm`RrtBhjc>O-K16#?Y;+!|jGXa5zT;g>YoFRk z8r8bZ;jlU+%0@r`pbQ7e02ct)#CtrXZS00oo8>Ibv@5wQ)>>=4YHS0gvzbv%==;u4 zuT|5D8%JWs#9hq-uI62IRD&Dlk2CIg8FMDq)s%G4^TyIg`5vix$afy8m`W+rd0EC< z#VyhAr*tii5GqJ6-PGkb{^&lEkulO4@yc!oP7h{`WMsUTrlHKVh}V1IHTv;x{5?Uu zTuE7H=hP1}S!=P2x5A1vRgQ$pI446cLatwf7srOpB<*D=5l_C#k0;fA{nALkom!F| z9?Zc`rIyR(0#8i|VhEmz4ZRUjx2VEU@ltQ7Bfk49EViGr^raLrfi2^Vg zyT&Qk#zz{QYu^)BPd6ECKP1mmD`#_07+QOGRga2K3`L$T2Bhv&>kgN@?xdCauLg)- z9g4L_j3Rt$n_pHJ<(89>T_Ndt)>4-_8J|&fk2<@0xpyt2F2|v6FmoZzO0{=H@|}ZB z+ANu2tz)q!Gwq`2<%xK=S+8dWM+s6qh*V7XJptB(9OYR*WaFFS3TyaAT5$3=WJ1X(1g znOYhpEALEA#GKoyV(fYF2=B(QQrShA79U#mxdjYgm3+@Cm(!vs|vivNW!dib>yh zn&(v-J%wV5VS+OG7Z*TiCMjd$kVb+5t{`O=iC2=1%pm0u%$b(xxN{?=xu@}`!NuAV8o{Lq)DD%+$vBb-)#tAyXfJ!20~ zveA@Xh7Ybz=;$qIIOw?oVTIY;+toR4-hT84awyw6$ zintG$i5(LqTAw@|^B=G^+Na-*j|!L#^`gm{?5B}*^@L|*mRDNCc%HpNHdtSmO#O$u?LV#6E53ghgr|*&( zJ4yvzT=2dXFx|dDMkL%O;x?X;M0fSE7JJquy354|+=e#T5pm-aV-ir7C2RD}O67CAmY)WgHMREyvApEkh)|H$UQ$2XP_ z-n)beP=;^#M_V9(m*Z9k_@i`Q(7s%zsPV_qd{e5E3)z2Ni|kEBIOCC8b31|~(>E=o z4GOFw2QJEnBkT>cPU<^eHPI0DN3;0g%MHe4iDxBGQbPvsu6kHG6={e@XY)V~J{RxU z8CNvEgQ|P76I(yyo_-uNVVKrC3|xI-Ze92#hOm~6T{l@le~D2>c*I>Wfw;-%jAJ4& zeRYmzV->>wjTe-}DBP!?+a>Z@L7uAKF#5w#DHxD@Hb%kgD!Df~F}>AaI-Gc-Si!M> zPmW0YE-qJQC2dlO5D(t%M>F~?q4CRT+zg$MOpJpUMnet=!!w_aWaC2sna@Hm2{Sz5 zmZy$K>Akj&IOaTo;oAj@N1e8YhP_JneQn(R{dz<-nd#GsM(9#AZ!QL#l} zNge>J1HhW#7vL8b7WhNuPsrc!lz-!Oe@o+R{{f%0MM)@PPu(#{v;h(cmr?OVVYx4s zq=3#_BESz87ZbrUp?^b3{Sv?+*b{vjtcEZjNC5jc0D&aH!V&_Yn*!nz0@!~18}f&y zCeq&7!TXBO)v!E(8%!;1g83~R{4!`WU!UO`kzKtN1@PhMC-kWWZd3Bo5PEFi=uAPy0f7ZDei zzoqokf&W$afAa_Il-i!>q<%~N*g8AA{qT$PIv~Ow=xmSeHR<^h$3i7gwrGqD=*M#Y z_4un2ibVYI8*F!E6njw}N2K^HLnF|4cgoJ-32*Lk7@KAC_yJG-ipg*q$iHU=- z_kuuzKU0RG&xD%F~D5=7XegBI*Y&;niVHkaAk8DH(u)H;rN` zxdetIyJpQyAov^04fk-OD#{@mZU&Pq8Y5K zVn=w9(W7xop4P~`Qsydw>2{M-C-h#Ld(Fnn zFSGD5F5t{dI+wzswZL2ClC#xzQry7T+iB+U{)a-Js;ZP7$zb7?DN^M1{!`+z&W@_r z$C{~TDWm11B#j3^y1DyL0-iIuz2zvww<@sfw1gUK(=an_nRq*63`rZ=dcJ%V37e5T zR(!0*(eCvxX*3wazx24M(vbThy89Lf>QYy{U-s||d%iFH)3~&JKiQ$8V zUOBv)r~u0|S!UmZ8yYb%iD$BFN1t7cNqRdC^C^M?Z2Ysp@y?^Va2c@apK zE|q(A(Ij)J0#~ZO-m0=FG4siZW{l~{X#aY4fxmmC;Us=%rfP6xE;_Q6rKa+G+X-m^ zZOpUVMKN|Xqs5ua`{W0Fu~qcLY|}a4Y=d>e-Istl$tN7Vo0!S$hfXzubO?jXB5%9e zIullttyefaD2jCA%vGCJJ<80yo3biiApacl>!14Gtnefk%{+bk zIB8Grtfg|3__;S;>$g!~Zn6MM7250XOZLY6R7cGMb*Wo%bF6W2aL&$R4WyB;=1Q z46WK(8Mi#FCML&SVbPyv1{3c{D;GSy-C)98+b1`!l1LN`GfRuIueQfczg3bg^1vzx z6j8mbaml{x)s^RLQ{SL(D&JIGGR`vohD z))GVui|z97!QX7^Ao;v0d>84}D)}GWj+e4{*8vdU`clMc#-#gEH%2h~tA9dc`jXSj ze7!hvC+->F-SGA4mQzvQWZ;hh!o$(T>3n*=$p7{16)gT&T=_Rw{Zt|5Jaw<7=<9{m z7IPyOId%Xu88aEzdALy3&X#A&;Kt2h3yU3>@~QHv>y|)Oggwm59BU9Pe*WNq1pJ;d z1%L#EfqFghz+N3Y7M?9uK8ouFj+LXu!l2eUCq8 z6wh{+omj6xCRhu#5f(mV+0DD+KfkujBm2k5I=ft@MAe%4h`u literal 0 HcmV?d00001