From 85bc1e35904f9e39d6e557e07f45051d24b9e737 Mon Sep 17 00:00:00 2001 From: "fluoryynx.l" Date: Mon, 8 Dec 2025 16:13:16 +0800 Subject: [PATCH 1/2] Support for gradient accumulation #9 --- .idea/.gitignore | 3 ++ .idea/vcs.xml | 4 ++ F2LLM/GRADIENT_ACCUMULATION_README.md | 53 ++++++++++++++++++++ F2LLM/README.md | 6 ++- F2LLM/__pycache__/arguments.cpython-313.pyc | Bin 0 -> 2325 bytes F2LLM/__pycache__/model.cpython-313.pyc | Bin 0 -> 2969 bytes F2LLM/__pycache__/run.cpython-313.pyc | Bin 0 -> 10414 bytes F2LLM/__pycache__/utils.cpython-313.pyc | Bin 0 -> 18810 bytes F2LLM/arguments.py | 2 + F2LLM/configs/config.json | 3 +- F2LLM/run.py | 4 +- F2LLM/utils.py | 28 +++++++---- 12 files changed, 90 insertions(+), 13 deletions(-) create mode 100644 .idea/.gitignore create mode 100644 .idea/vcs.xml create mode 100644 F2LLM/GRADIENT_ACCUMULATION_README.md create mode 100644 F2LLM/__pycache__/arguments.cpython-313.pyc create mode 100644 F2LLM/__pycache__/model.cpython-313.pyc create mode 100644 F2LLM/__pycache__/run.cpython-313.pyc create mode 100644 F2LLM/__pycache__/utils.cpython-313.pyc diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..26d3352 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..d843f34 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/F2LLM/GRADIENT_ACCUMULATION_README.md b/F2LLM/GRADIENT_ACCUMULATION_README.md new file mode 100644 index 0000000..3f43124 --- /dev/null +++ b/F2LLM/GRADIENT_ACCUMULATION_README.md @@ -0,0 +1,53 @@ +# Gradient Accumulation in F2LLM + +## How Gradient Accumulation Works in This Codebase + +1. Set `gradient_accumulation_steps` in `configs/config.json`; the default declared in `arguments.py` is 1, meaning no accumulation + - e.g., `"gradient_accumulation_steps": 4` will accumulate gradients over 4 micro-batches + + +2. 
`utils.py`: + ```python + # Scale loss by gradient accumulation steps to maintain same effective learning rate + loss_total = loss_total / args.gradient_accumulation_steps + + # Update step only after gradient_accumulation_steps + if (completed_steps + 1) % args.gradient_accumulation_steps == 0 or (completed_steps + 1) == args.train_steps: + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + ``` + - Without accumulation: Process 1 batch of size N → compute loss → update parameters + - With accumulation: Process 4 micro-batches of size N/4 → accumulate gradients → update parameters + + Both yield an equivalently scaled gradient, because each micro-batch loss is divided by `gradient_accumulation_steps` before the backward pass + + +## Example + +Let's say you have: +- Desired effective batch size: 32 +- GPU memory only allows: 8 samples per batch + +**Without Gradient Accumulation**: +- You're limited to batch size 8 +- Effective batch size = 8 +- May result in suboptimal training dynamics + +**With Gradient Accumulation (steps=4)**: +- Process 4 micro-batches of size 8 each +- Effective batch size = 32 (4 × 8) +- Approximates the training dynamics of a batch size of 32 +- Better gradient estimates due to larger effective batch size + +## Configuration Example + +To use gradient accumulation, modify your config file: +```json +{ + "train_batch_size": 8, + "gradient_accumulation_steps": 4 +} +``` +This gives you an effective batch size of 32 (8 * 4) per process +while only using memory for 8 samples at a time. \ No newline at end of file diff --git a/F2LLM/README.md b/F2LLM/README.md index 6b79819..b0adba9 100644 --- a/F2LLM/README.md +++ b/F2LLM/README.md @@ -27,11 +27,15 @@ In this repo we provide a streamlined and efficient script for training embeddin - Setup environment following `requirements.txt`. We note that transformers>=4.51.0 is required for training Qwen3 models. - Download data and backbone models from Hugging Face (we use Qwen3 models). 
- Run `tokenize_data_qwen.py` to tokenize the downloaded data -- Modify model path, data path, and other arguments in `configs/config.json`. +- Modify model path, data path, and other arguments in `configs/config.json`. Note that you can configure gradient accumulation using the `gradient_accumulation_steps` parameter to enable training with larger effective batch sizes on resource-constrained hardware. - Start training with `accelerate launch --config_file configs/accelerate_config.yaml run.py --config configs/config.json`. Note: we recommend setting `num_processes` to 1 in `configs/accelerate_config.yaml` and launch the training code once to generate cache for training data before starting the actual training. +### Gradient Accumulation + +The training script supports gradient accumulation to enable training with larger effective batch sizes on resource-constrained hardware. This feature allows users to simulate large batch training by accumulating gradients over multiple smaller batches before performing optimization steps. Configure gradient accumulation by setting the `gradient_accumulation_steps` parameter in your config file; the default value is 1 (no accumulation). For example, with `train_batch_size=8` and `gradient_accumulation_steps=4`, the effective batch size becomes 32 per process (the global batch size is this value multiplied by the number of processes). 
+ For multi-node training, run on the main node: ``` diff --git a/F2LLM/__pycache__/arguments.cpython-313.pyc b/F2LLM/__pycache__/arguments.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f6c42de98cede151da5a675fdfbeeeca560257c7 GIT binary patch literal 2325 zcmai0OKcle6n$eqJO0K_9LL|pZjyf7k^*WIKBcK>NAv3#D)b+Sg>|5DArS@n$IReNRc8Gi4ENQ{GnvS9r?~Z_xI+_ zyZ4TF{eBMt+IV|bZ&N~kMbINhSJ-_Cgu5h5veE*%ATjBJjoH$OZ3}j07kA3&iWDUi zBx{e8ES<4+VrRN3m?P_m6Xq<~GR|$(Wo+D?S1d&>DyGSuika6{Yfvx%3*=!gfxH6wn0v(o zHoriF%qvhppdj-JG$hb4^NYBUKqG8Wps+xrEFe%sAc+M9iV75CLjuJG8fL=+CAugi zP*R{VHX_itKq(d$C@s(dc%7r!2^J|#W~MlOo)ye(JTUGlmGfFrt|->KD{TJ_#J!y- z5>DLL0a~T3u9@2-BBWR@$hu)^>S+UqKDniJ@$RfyM$l}P7$i|S3BTFMYfb1YJ9+u%n!y>AzSQgVS zS1k=!JAw{a&}<5sv&XQM&2NCcOLm-OXyn$VR%ogDjJ!-avlDB`f7HqoAzq8Cx3t+&#^| z_kJEJ=j~~ir6nw>eblQ}8!>FT!Jcsmi(wy4kY!uX+rZBNT)n}cCOIq=L0S3)dCh)K znw1Q?OqZ}n_t7AHHo!8vPYrO8oB&Hve5B9{Uf5py=zrt;`&_0|yWO#ZE4=Ll|P7%wv9iq9k z%AUo96jPa43~8iJ=RA#3$gr8r&w#t|%xpU=5cWnLD2Sed|v5#WwPkx-d?cB2yPy9D;;LcqA z(4D#3xmIlK)6^%aMl^GO=>EyAW2ZNdoo*bRYmA<$oqOj}D;&F*`!H9(xG}VGa%*yC zb8@CJG20k^3XBUaZ=klY=Y@PfxnqBcq4sSca`u${yHlQd_dsTVd;5P3G?QVkh<)j{@7VOOo`EWFC?*3=^Ao&o(Y4cL=auJHQ=; MyJsc06ckPS58dw|SO5S3 literal 0 HcmV?d00001 diff --git a/F2LLM/__pycache__/model.cpython-313.pyc b/F2LLM/__pycache__/model.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6009551f4879996e6d10e6d9cacd495c32bf5335 GIT binary patch literal 2969 zcmb^zOKcm*b#``_q)1A(q*xXu$5Cv@3gJkM6-z~o{46_;ziP}iQx$Z*S&>U}#pP0( zS<05bIR!G1Ao0O)4m}vXH0`D7EysQ?K`&ieDlAL|BtU!7O_5v#IkoTYawWw{ffmRB z`#v-8y?Hb9KD*K09w#tnua8;lDk1+sCLBtO*mx0$dqg9ebe1s0^sID-GI~a4@&M^2 z`-mp@5ltDTfd*5`Qd(U_W+F{R+p&gQKVL7JwrEH4V!diQ)-{tA0`UO0bjijr@b}0i z!X%9_s!2@VOZE~iihE2;Owp7+!cyLc6CCP5rXszamje7pj2ymM+dH>3AA)5@=x5IrTCuvfSWFy;W16re+ z{IUYJ&?|pvz-2o^ZVP93ZqXrSw}n*xsgU|_9*n|iall@QfFfjaz~1z?Ow#JaMbTNZ z4PMa=*EJp2syq5fS{9uyYj9U*X07g;dI26zB-}bHRP>@dKVyn+5O;K|He;JLP%}{R 
zWV&4_1}k%+`l2E>RkG`bdw5K!fmWnw&RGSM;iZZlC04KLGt6|EVL4_|P`f4~g}PI+ z%A&2oVOyh6F&W;uXmi7W5gj$-s?N<>-8P-FTS+TI;ig>@@fwDgK4Z8Qwi|RVvkGU48d2-@Uw`d=Ovm=>wEl>FR&fb?9N&p-<0zalpIu z?xRbk=B3gaAvdMB>DPpGU!WisUEcrvg7Q)HPV}RWI~_~0ftA$3N2#%gsj<(lEvCjn zqGNgQ3xA9CZ@_M((0W#+35Ks9+DC!72Tz^F*(XMJGsw3VASgENKp;qvq^uj3Sto$B z#Cid7lo()r07aaet{yy}*cn{J)(nOsN5%ID9Q=OyZu!1**I7_LOt7b6wy=GF3`O7L z8!!tVL!w~DIyArOPkuA$V3p!(7|NQQqeCz;1nCD9*a}x18P1NPPmIt4V8yr8^2Jed z$`7Tdqvm6wiR@;Qhcg7~Tpr`fA2}NC-c8<#aGD{Uc9Tawd>tYXo0bPhGvWY{Hy3Ur|=RAxA3Qz&zmc_kh76IQ5GhFhqx zeJGZ68a2IQu%ZroifFyz!ZEq%f6^!2ux;HhuUtqL$F_t86Go8v(NdjV0W+dKNc$Fx zC6@QVZ9vYqcYS>1PJV%|?C$&J@dxUIYI9&@anDPi8K2YN7n^&Iem>Rge`9g?v4!MU zT|M^)?+$+Q+=E23?}g>wr@g(y&ECw0OuPFQqH8fSIQ;0~>&=6&d$IlhJb38W)n8SY z8rQt*H7DsMBY~1o@X1|#AeqLy5Mb9iXwX#>SnwsThr@fy}FTGT4Hf}XXZbPLU zJVsZZANpP5w~3`IH@us-q3Vv);}p;ddJ>E8IH1RbT$SFUAU=AEt_O_AxOXAz<;`ZU z1gh!NboubBf9PC1JpSoZUq_Xl@in5f#UDq= zX=y_GYvL~n*k+Rc0{6a5bwuydDb&9V2-b?UjFS~wL*&q_xK^R_XuL?Tp~xh?j$BT< zfo>uPk{Z28y;%m-4aq;!9JmGCDS8?$%)uWTaA)bcfIEj=LpqO$0Rpq{NE5h*_5h)E z*yOZylYWC}T!u+Rpm1D%6H!k7ne2;9%3CM>R*c;KcAesNAVM=7sv!B}VCqCB_9uWx z;(vZMb>ih@I^wrwzss$45R zZmsOl^^(Q7Yr_wOQ`dFYk0zf%@Dl)Hm+mKyRlwgHW=%C*lk56#h~Jj{@MhS?eSmeE zrvSW9)*|HD1JKDHN0aK%`W{Jr?q891H6Cc8LRt+{F$w=J4u_1W!LNn?TjT$(1V&#& r%h>;dJc_*%9O5U}Kc04`dN_48 zlaHRg7Z;bL=(vw1xO;Z@?AhJ3=bSxf@59nk8-g_bCwrp*R*lfVkcyrh1>j+`6`|J= zk9cYlO;DKP2ux4X6AWf1Sj?(*X3{ia#^#9O z%|tD(1=ta^`uFG?FiO638&rZ+GumBf#C5^)HMRO+y?>v9uS3DQSVIx6_Ydo^Q3mUF zQ(6sXch(}z@nw)V1xuQNOGY2iA2mklPwPA8V8<^%sn~4(YtqOE((7fyf%${UII4DETTKk4GkG z#=XH(I1%svic_1cBG|6)1~?fp=F=g*x}V`|)EWDNeg3H4Zj^j&u(yW_?uLE#>1*tQ z0Xkhw#w{55I>c{*6GwbqKLZ$@{)FCcl)>I$pEma{TBi*1vHeSN>&ZIT;!ghrAv=OG zW`EM4t3&F^fTV6r`doM**y%>c&@oD4cECBq3DqF92$Geg0w`G4i~a=_IhzNyIZA-)k01C{jS>*CQs1wx@- zSm6kaYpzAXeg1Dfj;G7MyJ%r_t^nO{KaNjM9IVSzdJpe{6>Mr@_?FFA1{R>TkI`p= zGpqfw^MUaf^?8gkSjTS#S*i)v_+K?zb;$oc9WqLNk1*Fp>sAeOZ%*g3F9n$f6JhwLz{CI_CK|o(Qp=_4A#WP^bvRh_FCir zE4^-%I^W^^hTy#R>9T|U!9o9z4Llvv@j;JkH9f{NH~><***XI{wcucIhU{m$u3$@s 
zi?of;8&VYSgpAkYnO0gwFSQ3R0o|k;I;tGYsVob!L zlqjTdC>mG12c|}N{_w%W2S#QNA3Y+Bj?9ekV>7&uQA$S^Q;C_xX)zwXAY#QLi77!6 z#fVZmPoVHgF|r7>j=(IASRxb=BB4|WkjSi~08T-hh@oa5(PvGNs%uS^Pr*_Mfh3HV z#50RxJWT4Qkw|Fa83G*KIW;w{nC3z;NmQz4j!uppIsDw1pBKJ1;vbos8k;)IPlpQ* zgUUQJ7=IRhjh?n@Vi9NJbbtXowut9MD9)Y$ zwreTC>S6+;T!m34z$nh_XLVeoY(^(wJ}m{9p5nEt(~6q|Wpup&Qf@pGje2Vq7t$5;%NPF{i}1l)wrjEiNb~DH)5Vd?kuC8c!~!glI%kN)|%r z1TugjHI^KeLY-4sXp(RhEI=YF<+P-j@nXC?d0sIKQYsWajXCIp>k~=OLzYH&wsvRj zJwH0W9A2_4dA~FL*75Xc#@=(Mj?3DczCXHj=DP=$s6RV+ZIm=OeOy{`IrwVujh5xe z%Kjg8WlP)B2lMvw%lj_vTV}KN`V3qD=s^i`?0BTun-7_*Is|TIVV!yzp!5qpPM;{#@l;u4iZVuIv5z;A)f=)X(Q#%|JoDpt zT`*F^1<3{ZQYG9ARCl^T>c>Hb69@Q`0LwE36O56ik5Q>Gx(rXlsS+m(j0hW7vDR=5 zr93uDKsyx=_Wk|gwC}j2`NIJ1M=Kl z0&`MmAsG|7UK<&)4z;5_+STjZj7SG`Y)&T1wMV%*oLJy`xl{s{?z3THIu%FZ(h%aI z1@SYYMV~?JkCZZXUqX=;N_k-~YnrUE)=(;XRumMQ(L4k9oP9Qw3ZE3_qF72PrEs@I zqVaj*Y$&!U`b?O}6S>aL5Q8Ww&QJ=hXDXVA3kxCXv_j8HxEWA<6__ZC!jj8VajfMQ z7|2jOl2}m88k@k5Mqwrsw9bpFwN#wM;1=e9bHyOqNHw$yE0Oz@7!SpoDqgSZ4X9SZ zUMqB51e%HXJSNKJvnp0?M>zBn?ja*xx-7uPBhj!ZDHQyo>fSKKEl4SW$<_Upt>2#{ zksGIA={XJ)O;K#&L@Y+Ug1I<;igd%jM7)kPy5~YQ%_~jU94jx#)w@=oIGU|KdZl+Y za6A{7lLK?JF39O@%+!8|NC7ZnLeETN9zZ}nVA=}yF(e)^RcVy z&bA#{OYe`)uQX-sb*t9;x6aF!-aEdXSxevF)U1qT>Vom8X9=`F6j&Wiaa;N+0@HSKadQ zo1-6DbK@__@N)_2!}+SltFbGwzwXbq^vf;%*{XqD)v#POoUIy3AN{1J`NQGWSur!0 z$j&D3A#{MAqM?|kkI?r??I@{zjebF^38ZkmNZL#<(vKcgBJ0jaily(-Z!CdF56q}; zNFoydUW0S2-16Q|=UA2PeQU$m7RLvk+Oa0f3^+j+@f?VkkT`w8N@TJ7l$3}o)JvZc z4T5enhyV!i1xq(Ld!RQdrKA9WD%gmEfhNR2M6BTucx^%?iotIn?w=~>^FKZG=>^YE z|6>?@KAYMs5D5hVI=MJE7Xv|5y*@T2N+}A5;csn+ zQ>xo4af!2_VIdBLKGjZ)2G=#jt#LOwP)rVssYM)@xNw3Pv6RTE%N3<=+enHNfxvlA z5TZa)5P(!%e2LTanM+`BS-B{T#3HCeB<(n&F1pg??H4Ou*Q zGPEcGI+pVvLzwY>l^M^vVo+)+fc!eRrt}l+4N1R&;8x#Oc*bZsQS>(13EJf5(LGw=0Lmk*5iOI4MCPh`7Ld$mg>BxI_GJX zJ*_!UhwSON?b&{5^y0|J&0V?X-E#Bpo1tv;6BkEwjyl;Bu4MLzn-x+%%V`{4iK6Ogrl`P2fB zaTkOUl(7JC3k$k&7QJrb&@?zv#QmB_dzm>(J%SsQYVvQ;3f;W}gKG+29b`IBt^4N1 zxOgs!hm^MUOa)sn2;KW)i7a*4^$pK 
zB#Zw>bu9(TO&m1sK+WoHDfuW&xG0fj7ml?=AonJms(X_KkY`GX+@5Llaao02F}SiT z*U~4q^yOLx@05p;k&>5~K4c zQ<9HWrAjfWmK8rkh^V4YMA0xHb}W)d2Ng;2u9dbm9U&MZeGf8_ICss}!Pf>?Dzolw z7uoMx@?E{Tu0gqLFxPcJ?mDpAHMUxL@K)E@TQe(9FNNOlEQfyJeeIOoHFm4=AmH0R zuB=&ZeY4|Q$D2LZda|{h*~;ytIX^J;>yBS_WW8g*>bUvz8_e=(u5PK(uAkiBCU+3z_XJv5=JL)4?+ddDQf(_dEaf71NEtN8%6K9D8GrUm{V{v707w-$tx z8(9LN$rg`{lLMMuPQG!A z{i;!o=m?%e+M64()IJ7Yy>J`|U}ATCR=hyqC}c`W_+)}O-)5sHV;8F0IuEl*E07UOpLbSVK6Uw2 z-tE2m#FZy6_ifkKdnGXR_fjt`@*5+}fokeKbKS^Z z^j>p0fp^m+-#fD9Kqd1*1*v@Cp$J?_P$6<7eAG-%k>4ttkN9i~cpHR$a4*>eVIe`@ zCINN`!kNWTtk6>?Xfa_d8W-aU=!Uy2k_Za|Ce8>JNtPtp*CA6X1@ioriiQPh8~z%(?RNY4y&n68%WR} za~@Q4@c(umFu8uQdgBwhEI19Am`F7{3xMGZ)b794IkFOZqi^}ln|qe`K;;n*1IC5n zj;~1E42d2frslxwO>(m$ycmdF!7&`*E;MmpL7)TFryHj7(ewZG5lR0(JRiLL{_om@ zY8c|s?g=3xhC}BSItKU3tO$?GiaDH+;Orn$u_ltK=z{t(?INN1OCh|ln1llplaf+F zTGUX6#Qj=ZgBIE>M8h~C7~zYMgeYJ>xu|TJ$Dv3R;wS;U-Nl7P2#3j&fI9fX&{yqe z14lx0ppgbce8&_Lxh*aLc?lC2M=>p?qA>}R3r~AyaKQTF5-@T>gbS1yDNNjYOq?Qx zPQarr3H!7w*iv{kkuWjXic5V2?p7&wlP*kbw_+>0?II@E(c0(#ZX($tCbn38noue< z8u566435XOHwwGfp|TsplOik~v`dU(VxN@~F@83RVO``jF_?3(bAo|&KA#_@Vpj1+9iNzFou_h5Jn@GZQHJc2jPQum`@U4QH zgNN(Ig(T!8IB{wRsn4T13?vapf@ zDPUYABEXBt;zC^WTU^>m5&;Y9F|h@R>%HLdC9)QzcUK)FWg9A~?WJ)k~7_F49ED7L@v; zJVVSMKAi*(rieC(@z5*;_2k14m`jiW34e=>DUFFCOmtcxpNH@zmGnBTa>TIl*++3j zL{POPd=L_nvBh~fpEWJvM0_qfkBM7`1B8-|IC2s3EZmGbM+mpIO{)&1;RX6=e2$Ev zBa#~X?ATF=C--S)eF(n~-QYSQ=~>X9dkjTUcM4syd6jcTv+_ zwCyfxzKgnkgB*8J=Ur5P7kNL~)}5wbvAt}|*jr@O`e{k|pZ-yrz2mI;&O+LrujF!- z-Ew7jnq5T|d1TL_av7B`wXP!X1D2&49yqJ07wP*u5oNtt{!M$Hapsr?_~lviE0f=t zT-tSY_{wmmqUCz(7Q5??we;fb<@rnVmt&V=%e&qjzBYV)GV6Teot9PW(7R1}i{rBO zRqImgRo@lgpLZ^wxn6$D()k60Y{U1=sJV5;pFs_2=aTb|wQPyLYJ1JLyeqe*N8Zwt zb?vxq?M;v5-JYv^uk2kO$hzBB&d6?Gdg_j|cDes%#V@>Xdw)^)cHKLX+>l=$@@HlO z+1_KB*}2?oQih-NOa`Bqofp!F^3KYflarm?O2f_VHyhq#-aYhzL++i*I;RVrZL+iN zX6w7`hrJnRTSjeWD{^d&%+@ULyY9W|m3{r0+JRN}$^5$ZsvC`&+Wy<@!2MEW+X=6U zrL{SS@0P=Nz4h;UZuDdw18M7h3#x4S+{BdAzx6a`L>9ES1yK;Twa^H9+Ajq~CZ?P{L!@jW9Q*HMVWcT({ 
z$Xk~|?mSbHW85<1UJ7Ly?;UTgT)Y4JV6Jmm?i^n29Ld)1&wBU2JChqbCd1ErEIqo) zdh)dm8P@xWE*=>VpP6|!HxrR(A{k!12cK-IL*&`dJx+fP@_LxQV=28DxtzF^SWe{{ zcgl@B-|@(np|mNFOs`m8w&YNojM`RF`=`i4#OhN=`O?;_?N{1!6P)J;Q+F8llpT zdp7hO^#YYU9+%;VTv=3+Lm>YxIkZbgyE3~2GCGz&_FV4RSs8wtAoyj(=UaPnt$XFx zy?NI9%J|FU7o*E{4;gwZEu*Hi>6?~|zNH|DE@jRk7tjC|y5nwG{^P8>D}!8lM@`Pr zC_5T&JDOe|O^>9{AEASk3uO=6|3#zuheN J+5B|Le*;rz=oSC~ literal 0 HcmV?d00001 diff --git a/F2LLM/__pycache__/utils.cpython-313.pyc b/F2LLM/__pycache__/utils.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..62dc0c1d872aee556388691b28f9dbee689d8238 GIT binary patch literal 18810 zcmeHvZEzb$cHqq5g8&JF;1~E!f*%q;#1Dxw^+kPA{FX#g0tk{4DZxWvNJ0b#-~fd)@tB_v`N0_&PH)oq(t7wH@xquMosP;1A_d z$6mhvFYxj3rFF#UL)qyUV72TI-I@`3t&^UGZ>;hVpc5K@AJ@5enC{by{<_yH|U#pI{}zI z%WwfY==QMzQSAhv`drZ%E5k8o-ORawsGp?SSvNbqbElZbczqaH%FAkn#n4lv7McrNq9L#)>1AZ~yJer$#pLIAr zp#anZh&ueO{uHT!)_`c9bo)|4r%IRLy^Jd*e2R1opU*KampizgnH0$wx-<}^owLlm zbB1Q8ne~O9or}wwphei=0t>A@&G|z9K>cPhjpn99G)s%>0PAs!YOfU1I8^!RIZ+ds z3(+ny^C_}onl}Kl;`E8xEFBDSw8t?QVghpi>SlaFzb`mPi<&70`dw5r9(S9lo^nHy z78;Wd%!xX7%IV|$p@69Mx}jG*5OYDdYe7s4INeyRMAh7!Uo4X7`T)(8$9K*Z@+atM z0>QwxR$wp~Xl)e@5KS_ya|ZaD74>u`A?_Icb4BkG}64nNHWTSc8eGzpyo zKm~zTALn$?lL1LY98Pyol2$QK&Ir1L9wwgsbZQbTv@Wm!z2fuwIR@ktL@h1gXPh8p zqAtj}&$?-#rk)RkykhQCO&VjV?5V`$`Z{nAJ|`Osuo3WL9d?|PlvvbxeA5n>+ZlAK zu(rn@EyVcy>yr??P1s2*8OuqE+G6jyL|*|xgeZdWzhWn>$#`NAW3Sk$wphA_7$_o$ z48lT8!`L!dPl*~BO}qgyop!lERcRO=>NQ*!6y>@h5VLSod6*yrxk1Js5LL6xf~Yx5 zdqAEv0Aaz8s0#(%o6cYeomtRTB307y77dgaltgS82|?H?EAEeVk5#`e0Y2@jr|@Q^fS{~~Z>K7ySdCqi8ZXo) z1(%R&{5Cd7%}$Z}T*@|Tw_0k8=g65PF{;7Ed>efh#$$_fhH=jNeK3|=f<8;VO3ZQx z9C+k+_&J{wbPI+g8ixO=5GxPq>7Y45Mw0e=J%|YnL+X@|bKoIp!N)S9#t#F1kUIw4 zi~68%mSNqCpq5z@8c%@$I*U=*XLq9BIWVI&h_0eIYL?${6%${ zZ)ClmCFp9Au689oqT3!dR|@84WNuyweKdb_K5X8#G!#|m3hHVE*mOD3!fK&#J1Ud` zHzV`rXl}WX+k|qPR+x`wZq9^rcRtb+nwrOiMx%XXAT%bKnV+evxk2FXDW!@JG>fuQ z5vc|m8<`N$aTsRgCzU`}5q7YmY$}_2w~E!;Rn{~GIuY9w*wS8zEiH~sH;xAY>9I=o 
zo$@8eRm9kg6%R4%9%Ybhi}7t&H-TER`Xp>AR0y!0u$tDPf=`=fH#KIn;=W$@G{I2S z1Zix#or2QuD)iV4dxkZC9WZ<JVb zu+?15JeUy-&DLj%CYY{aI%2$&*v4T&TXMh{41y(=oThh5(-q8?+yq9J0aJz_D^!SM zL~0Uj1loZ)5)G3MC$>o))7Wyzo|ICPlZ=62$c1e^nE2d5DCnUl8Bah=^Wai|NsV6u zxGQCyi-of48Qkn79dyn>=|F&60-!4RANa2jED?`PM0&BHFGu?F>vLg!<lz;)vp}CbqHndT*iRH~g!#>ooFWhh-oPY3cJIWti8vJEu&KnbdIB~s7*t7@1UuOI5NuhHB!QcJ~ zlsU1~^S8!=>&CFLWvTaJM$Q|1U*G$m_Ahlk%*a|gBI{rBWP&FVX;S-dvi?y{h^I-I z#vxw+*y7U`t9_y7z{HOaD&d0))Z5fO{-|Voob5lk|a|mdbMh00$&NTr`A>=FWh1-Q7=PdHPyznPcLa5P7h!aNYXf2-{c~avi&jS?}7h;pFjZ0mP6=D5X@j% zIS=(kFVebS7FXP8e77;GD;9J$NLRCbJglpY>PjAIRCU@%TEdX|I89YzZM$8CO#7DlqsBbJSdNTXuZ@+<{m9rPWVWKr)?2ON%pJE&P-fRs&)=Jh zu5W&S(+6AL+p@y_<)%9gpEled`Qp^)r%=^+*fb%SXk?eD-VyM6`n-#`6`7FSBM)Bejf|&9bNe1~tl3xZPNdV2=p?DdUF) zBw_$01~0In#b9B!{!!Y*LL|vY;d&@MvDq^eP9!S=mLlsP1=rZT5p0v~R4o9kZ($cm zH1IS6?ToDp>9MA!mma4FBc1g{^t3WP!2KWP`X=EoRVXE5s88DSb!o&jk{wXB5CIA_ zXuqsgtclIGo2(THJj$CbP2tw2!=n4NT)CjG12xbZCTpcqA`#p4N?B#0C&BLp`0QiU zz(Kh_0Tr9gniR@lqWg#k{{b77m4O}INeX0s#ip2Zsa*s@b*1!0@-ZUth{4xuphBLYthsKn^|s`w5D)iTWB#aIR!Q}q&0^~ zwn+A#n41KyWG+^AoSkmLTNG{vvQw7NN-8IjDSKuaDZ2`1J3V+8eaa0MBCNiW@k%h| zGK{iG{;mAH@ey1%sqiU_4L0yB`z-Zlt`}NMHb2u?pJSSX4dgV~i#dmP>#(HnE0Sje*s@77p<}PIC9RW)!2TEaAwa=w zOa#2mj4w)tAwn`LiTM8S$~ zdZ%ec6E1GPlD1a5X?dD&?m?S-h0Q}~^ALaRczCmwA0Fo?&Z6OS!tfJ`Ry zUwj~9IQY1d*wRIQQ$y@IaHsT#rM!Jou+Jd-OvHZX8*EOUr&jG}J|Dh+9NL^`eF7Uo zY=}R*5M~$o^A`|%5ugrH!?@-?a+vz@9Mtv%b&`SxIYBvbGL9h8MJ>pQi`2b2WIywj zj2&8K&jp;Zr$Yt65aN_h)zSc;(}vVL*4p-bGVvD^{PEMmaThx7iX5K;ik&c^t{$KI zJnw$?{SN-jtZ-%yotfjgQ25MQ{@gq|vj9M^k;gFoS%`kT4LGn&7#KpFa-PC$ZA0qTHE6EJA2#wMPGQ82M% zca{%Ur2X~Y>cQ6Zd)0Y3Y}Z0~e_!?BuJq4q@^H9U3*k@qR}bz_|C>!(2>(ZI^^iLK ze;TzA{;!Jk!IliC4v;AiUel}K|JUD`pM7SM*xt3^B@0Z}>mM6sPkVZTe+HZiHm>4} zj0Bbql~qGo7!cP#8!D@Tvhhu>1cnWj)uyHkHB4^F7*f1pm6{gL*$}<7aoj7?lQ=Kh zP)Qw72IFk~v!Sx_1&@Uok8hpj5+k|*6^D+fOMo-LdQ*LXs_`Sg&XL#6w6^-`fhTF zwBe(O_~O=qmBlIW1O2!DQTW+H*fL@BHp6CwO{wJG&kRso z!SIIVb|duugYo#Xz^W8YLRm$Cp-)*Gl9FPtRw%vxv6*(mR5H*kO9N|UGk$Y!mV9!a00PNbne*)~eP>1a)pmP;yHg8-R)(Cr^ 
zEe~%iS$BTBge{IApz=2aWyVlr-;Zyy=G${^1vrg0+q3QFebhb*e?cu_>ouSX$Xj_? zi5z>rb@#V9QP{QOVR+r6tSA zKM6L)JdL5~s{p03b|h#iZ%S_gMePgnHP%k0R3c6)ud#L|(v>%xQIfAh970`sl=MWr zCkOZg&JtRiWT~XYskD+3!#o0W8C#SZKh!$~U-lxIFT61|0P~)-agN&@^nf*t-L>{5 zI8@%6;;r7RrlBq1z~YFKmxzIj6lE1&!F{tZHAl%UwUuOx?Z)TXKTu}|I4uE58c5Jq z-azt8aQzgj)*&S)5i32%mfA}}mfnhOT!Q#M1E+E0IPInQ`1BdF3yj%|?Ikd#7Rg)r z1Jkkg&-z!?ID6yVZMeo+8^^i98W%rZ;}=;z_G0VtM2{(NO3#6vQZiEZt)q()w#;5) zwJFev*jA#*gDh)LkIC_E>k_LyQAT;Ql`3Tw6D@vpQTEi)MHw5XVjWdVCt`(nd$Fwy z>)!8C3zWa~7N~gXEdZ8u2rM3U<1!FmDaBV*VA0!gpF&5+jxILdr@r#w?2cpW&DJq3wAw>f`-H)M?xEatLu+{Um+qzJWiPjD zl~t1kbC$}o{!6f#RIqcd9e{0H3o%6D@$#i{Sd&bJ~z?(^PMEX;55gWwmxQ|$Aw zNxAi#1{Ys#*x&=lqj2fOB6*o(2lovYYgo*;42HaNIZJ@XwOKh~}>*3%^8u;2!b5w7yHYD;UF79K4Bx_aT7$0Ni3hi_(ko zseb&JK6hOPK1ApK1Q#$gi(%~I zPbM4kDt9{=xCi86;QF78LJ~3VPjNeD$Y<#CLAx663I(`jNawzfgJU>QLjbM;lUPhR zygKG!>;~Tg$yr^@hg&2bAKXKF>Mn`otHAvk28L}ELUw7_NW{eqxX1-(`BJ60YdH6h zac~_1>^cv(@n*Ppamo@-(M`hbJnZKH&h*ekq8_{1${r8~xO7H);T_++!`I$ehX>yd z0g1*p7CAV=7quZjI0`V5I|TP#jI|dB_&zCj0|x;dz$8k<{0byRj0=Vg-WX$bxu~5=pmcl=)JeGq@iBAk-Hl<^CKO z!mSX1jinU66;F9v$c`4K_+=kPt}*uuTps`b1K10Zt_k2fG18?R^L#|TkaT+6#5W({ zn-22kj)?kDJpH1nOkq=3EMi*whn&lA>P~>QFkRWrRx7< z?e1{V^lI(y1E{ccY?dUTuER*cmRLTW#!Ip5zBk z2?K64;1&j0G{A-j{HWJ|gF}s-sC)MyA_-?zZvWWA-i z;u6gG!c*8(duv>18$oR&k!>R=V|1x2YA#qBdSGM1rm0mMb3g5So3Ok2a^P0mC%Zo0 z^~r&c51``C)$C4WW0v}P(^Pb9Mi>jCv7j)vfW{UgW9P&97gop4-yi*6=*ES2FRXOl z*4)v3s=Jf*X%?#JTQ&EgvGYrVeEx+8R#!NiS+% zt_bOh_|o><-FGWL?Y+I5?;7Td?GfEbR96^tv4^iMYuJD2AvnBvuX>le!{&y|z2E4F zqN?RuzTpt6>J+MupsFMHJHl0i{IOAfj6=r)!m;z{81~BuAA1c1u7m2sa@S9}utauH z4E~y?{Maj^=r!tcf3(E1T#ib%UOpN%lw3a=Ho#>YWN3ysYAC+GH*Ba|=@%OJqQ<@9 z#{EKL7i#PZH})Vy59CC%N+K28!dcta3aXcTdC08k7HS4i&A@#+Tyu=Kp5RXgku@Y( zFCyzj{8w%-zRL@kGAXw z1wp2R5mW8Mg5oRchec(}dQ?<@`7qo+D6Uy{37f%{ss|PINGX|xg293emgVAzp%Eam zEz6bICkh6FUIc$xz4w9dqx|t#c_;S3;oV-ry@1>c;bRLx{+jWsF=DbvzEn`IGGeNR zRN1d8V%ikVEfsR>QEvT8CCY7F8AiFSw}w9%|9Cu{dvL9+>izA@3qtKaRJ$)+z5mOy z1J}vup8a>We7fbjUMOrvh0QB-2#-7A!kyp{Req3sq$4s*ug?L$mxrQRhvv 
zzf=BYQT>&UXlbob+Kft@R~Bzq-ndasmrbEb51^>&BJJm0Z5DldtUx7j{2R zBZ?cKBE@w=Q6nm9{IaMynTJcjX$#OkB$W1`(w=Z>UxJD5V?y^R>IMOx;-_cOm^<8k zM(AcyHyiHu^MQHPJr5Fd5d;Sqxk#SEjKrdD_)-`-z}FrO7j|GqT0l(>!vXylr9Um@ znSj77AZ8)LoQJV@lpIp6GUw4Lnm=`#f0YKz2Pp~jL8=#H#t|g;!Ouy6fF4x*?FQw% zAv}KDK#e3wh6<040q_}W7V~xn{O}+hN01nz22@hwph_4WL(F;bP{Py!>tXU3&N>YD znE@V0QsSuUijlAAMy8&%?!)&cel)?;K7kG)IuxPLVY+0mYLz~RPMqRToa84?fwLN5 zxggIrI0C9&z&;?mK*sd`Fy`n2c?9!4L`hy)7f6~4+lB#nikilJPr-NqP&k6bIjS2o zH%0ZR1lus8&jH_>}BZzYc>;NO?ywN5^DWz>B1l5{B()avnIs zI03$wWH3j)8nTn>cLDiAQ!+(RX+vX;t&4Bs~@&~wD$*lZ>#Qh-|PFV@7~a7LukwJ zYSplWqYrq7szZSXr~Towxz*Et-gff+&<7XZy8w4CnL9I|&fM{R>O-}ItL1~}wEs#v zUp5zY1_b9N}RP5IY!S9RAiuV%iLwPq^3p10C{ z*9{#VG0iD?@>q@w=LuQ0@V7D~>>ok!*Ej+_A8s69E33FM`p)Q$iFYPeYCdYY*>c-{ z_W*y$#RWBM8}51{Cfh?8p{rU^RqK~ot$%CEe{83~sr8$a5Efn= zy*her;_Ae5%?B;tZ@D!RpWnB4hx2uB%ttZZ+KYM07`@8O1_bD}uia80x>w z*cmlsztQ)4-x~w35B%q$HACK&;T6r@dj7O0V(=LPb@{~%+fC~ zlpCPrjmX*FO=<=I|?YLf{J6;y*D|dbX&4vP}!&Us08wt?Iw3 z)#7khrKe5(*JWA=?`LfH>`{MSy4`a?{nMiDUbXsf8n%1&YH?e&*J%8CVIB^*Y9U-} zt@h>_9~9-`aGMsw|GmB1TWtKhVl9LZORK#WveG7^K7*S=q16w5e}R>;yCT8$T3vd24K{YeW~c~Scz z{=^HKn}l?KG3H#p7k;S#e!O5m_kSUag#Q72)AW&wB*_Ov@dIMp&k4id5fwa9@qpO$ zb3*@%VlPh=Kz|f;r13;9yqJADp2&HmrN{%z-bVyPw}ao{=&SThvgT0(kze#phM6=y a+Kb;Zi^%-1b`_Hc$*=d!lVpX&%l`+2or*31 literal 0 HcmV?d00001 diff --git a/F2LLM/arguments.py b/F2LLM/arguments.py index b967c8f..77d1a01 100644 --- a/F2LLM/arguments.py +++ b/F2LLM/arguments.py @@ -27,6 +27,8 @@ class Args: log_interval: int = 20 checkpointing_steps: int = 100 validation_steps: int = 100 + # gradient accumulation + gradient_accumulation_steps: int = 1 # just placeholder, for logging purpose num_processes: int=0 diff --git a/F2LLM/configs/config.json b/F2LLM/configs/config.json index 2ac3708..7b8505b 100644 --- a/F2LLM/configs/config.json +++ b/F2LLM/configs/config.json @@ -15,5 +15,6 @@ "warmup_steps": 500, "train_epochs": 2, "log_interval": 100, - "num_hard_neg": 7 + "num_hard_neg": 7, + "gradient_accumulation_steps": 1 } diff 
--git a/F2LLM/run.py b/F2LLM/run.py index e40b707..0731f58 100644 --- a/F2LLM/run.py +++ b/F2LLM/run.py @@ -134,7 +134,9 @@ def __iter__(self): num_warmup_steps=args.warmup_steps, num_training_steps=args.train_steps) -AcceleratorState().deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu'] = args.train_batch_size +if AcceleratorState().deepspeed_plugin is not None: + AcceleratorState().deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu'] = args.train_batch_size + AcceleratorState().deepspeed_plugin.deepspeed_config['gradient_accumulation_steps'] = args.gradient_accumulation_steps model.lm, optimizer, lr_scheduler = accelerator.prepare( model.lm, optimizer, lr_scheduler ) diff --git a/F2LLM/utils.py b/F2LLM/utils.py index b167d3c..4d48beb 100644 --- a/F2LLM/utils.py +++ b/F2LLM/utils.py @@ -124,7 +124,8 @@ def accelerate_train(args, accelerator.print(f" Num train samples = {num_train_samples}") accelerator.print(f" Num epochs = {args.train_epochs}") accelerator.print(f" Per device batch size = {args.train_batch_size}") - accelerator.print(f" Global batch size = {args.train_batch_size * accelerator.num_processes}") + accelerator.print(f" Gradient accumulation steps = {args.gradient_accumulation_steps}") + accelerator.print(f" Global batch size = {args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps}") accelerator.print(f" Step per epoch = {len(train_dataloader)}") accelerator.print(f" Total training steps = {args.train_steps}") accelerator.print("************************************************************************************************") @@ -165,14 +166,20 @@ def accelerate_train(args, loss_total = loss + loss_hard - # backward, optimizer, scheduler + # Scale loss by gradient accumulation steps to maintain same effective learning rate + loss_total = loss_total / args.gradient_accumulation_steps + + # backward accelerator.backward(loss_total) - optimizer.step() - lr_scheduler.step() - 
optimizer.zero_grad() - if optimizer.param_groups[0]['lr'] < args.min_lr: - for i in range(len(optimizer.param_groups)): - optimizer.param_groups[i]['lr'] = args.min_lr + + # Update step only after gradient_accumulation_steps + if (completed_steps + 1) % args.gradient_accumulation_steps == 0 or (completed_steps + 1) == args.train_steps: + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + if optimizer.param_groups[0]['lr'] < args.min_lr: + for i in range(len(optimizer.param_groups)): + optimizer.param_groups[i]['lr'] = args.min_lr # log completed_steps += 1 @@ -180,14 +187,15 @@ def accelerate_train(args, pbar.update(args.log_interval) train_log_dict = {"lr": optimizer.param_groups[0]['lr']} + # Scale losses back by gradient accumulation steps for logging for k in loss_dict.keys(): count = accelerator.gather(count_dict[k]).sum() if count > 0: - train_log_dict[f"{k}/training_loss_in_batch"] = accelerator.gather(loss_dict[k]).sum() / count + train_log_dict[f"{k}/training_loss_in_batch"] = (accelerator.gather(loss_dict[k]).sum() / count) * args.gradient_accumulation_steps for k in loss_hard_dict.keys(): count = accelerator.gather(count_hard_dict[k]).sum() if count > 0: - train_log_dict[f"{k}/training_loss_hard"] = accelerator.gather(loss_hard_dict[k]).sum() / count + train_log_dict[f"{k}/training_loss_hard"] = (accelerator.gather(loss_hard_dict[k]).sum() / count) * args.gradient_accumulation_steps train_log_dict['Avg/retrieval/training_loss_in_batch'] = torch.tensor([v for k, v in train_log_dict.items() if k.split('/')[0] in RETRIEVAL_DATASETS and k.endswith('training_loss_in_batch')]).mean() train_log_dict['Avg/retrieval/training_loss_hard'] = torch.tensor([v for k, v in train_log_dict.items() if k.split('/')[0] in RETRIEVAL_DATASETS and k.endswith('training_loss_hard')]).mean() train_log_dict['Avg/classification/training_loss_hard'] = torch.tensor([v for k, v in train_log_dict.items() if k.split('/')[0] in CLASSIFICATION_DATASETS]).mean() From 
33af82cc855d0e12a972660f422813db58afc448 Mon Sep 17 00:00:00 2001 From: "fluoryynx.l" Date: Tue, 9 Dec 2025 20:53:48 +0800 Subject: [PATCH 2/2] remove some files --- .idea/.gitignore | 3 --- .idea/vcs.xml | 4 ---- 2 files changed, 7 deletions(-) delete mode 100644 .idea/.gitignore delete mode 100644 .idea/vcs.xml diff --git a/.idea/.gitignore b/.idea/.gitignore deleted file mode 100644 index 26d3352..0000000 --- a/.idea/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -# Default ignored files -/shelf/ -/workspace.xml diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index d843f34..0000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,4 +0,0 @@ - - - - \ No newline at end of file