From 8264dd1ff485bdaad21a33876e9456213bd77a27 Mon Sep 17 00:00:00 2001 From: AutoViML Date: Tue, 26 Jul 2022 18:08:07 -0400 Subject: [PATCH] New version upgraded with features --- deep_autoviml/__version__.py | 2 +- .../classify_features.cpython-38.pyc | Bin 34876 -> 34981 bytes .../__pycache__/extract.cpython-38.pyc | Bin 36778 -> 37248 bytes deep_autoviml/data_load/classify_features.py | 43 ++--- deep_autoviml/data_load/extract.py | 141 ++++++++++------- deep_autoviml/deep_autoviml.py | 24 ++- .../__pycache__/create_model.cpython-38.pyc | Bin 13089 -> 13118 bytes .../__pycache__/predict_model.cpython-38.pyc | Bin 21276 -> 21244 bytes .../train_custom_model.cpython-38.pyc | Bin 25221 -> 25488 bytes .../__pycache__/train_model.cpython-38.pyc | Bin 10818 -> 11120 bytes deep_autoviml/modeling/create_model.py | 7 +- deep_autoviml/modeling/train_custom_model.py | 89 +++++++---- deep_autoviml/modeling/train_model.py | 4 + .../__pycache__/preprocessing.cpython-38.pyc | Bin 9423 -> 11647 bytes .../preprocessing_images.cpython-38.pyc | Bin 4047 -> 4014 bytes .../preprocessing_nlp.cpython-38.pyc | Bin 11694 -> 11736 bytes .../preprocessing_tabular.cpython-38.pyc | Bin 44599 -> 47525 bytes .../preprocessing_text.cpython-38.pyc | Bin 4237 -> 4205 bytes deep_autoviml/preprocessing/preprocessing.py | 149 +++++++++++++++--- .../preprocessing/preprocessing_nlp.py | 19 ++- .../preprocessing/preprocessing_tabular.py | 148 ++++++++++++----- .../__pycache__/utilities.cpython-38.pyc | Bin 38796 -> 38823 bytes deep_autoviml/utilities/utilities.py | 9 +- requirements.txt | 12 +- setup.py | 40 ++--- 25 files changed, 463 insertions(+), 224 deletions(-) diff --git a/deep_autoviml/__version__.py b/deep_autoviml/__version__.py index 176d0d1..dfdf086 100644 --- a/deep_autoviml/__version__.py +++ b/deep_autoviml/__version__.py @@ -20,6 +20,6 @@ __author__ = "Ram Seshadri" __description__ = "deep_autoviml - build and test multiple Tensorflow 2.0 models and pipelines" __url__ = "https://github.com/Auto_ViML/deep_autoviml.git" -__version__ = "0.0.78.dev2" +__version__ = "0.0.79" __license__ = "Apache License 2.0" __copyright__ = "2020-21 Google" diff --git a/deep_autoviml/data_load/__pycache__/classify_features.cpython-38.pyc b/deep_autoviml/data_load/__pycache__/classify_features.cpython-38.pyc index b96bc36aff95a2ad2642ea4170fb60dcdbf36682..758e19475097a37731aa035b83885b166b92dedc 100644 GIT binary patch delta 6899 zcmbtYeQ;FQb$|Em+ugU{`dY14-x42U5kd%pu#rFl#5d*}1G0sXxe`b#R_N)0`FL9y zB&@|YRB+utJOpRuX*_8rlQ`XJV$#%fnvAD*#*Lkc(^%7b>^64e#KrE^ahi$K^mp#A zRzgZAX^Mt(?>YC}bI(2J+;h&`Z;z{Aysr8V_yc|;_gx9LXd||`#j`I{gS*2@+%@LxVcf&L;>}>gu#fw%vOXUV z@SI-8bFZ?vJE2%Q&udmJcb{MK=JSG4ox6GPs%p7;VY6b|EjP`=3U^0Uz))BjVOJap z2X+?C*crZOXK{9C@zl=Y%vO8Cj;$q`t;(3AN9Co{o8ulHz=37WiX6)`V?}1H%#2lX zf57rgtJRm2%$-o;UNaBZ$*znr9*Ek;ZHCZ71;OeGmT+3WW`$r_Nfz*Vyg<~2s@Qz7 zD^zp;0>1F73X9^-5aA}3EnigSRXu96a!`p?Gll-R9HESK= z(&y(9_z}j*gwNtE_u_lG<9^EzX|uHthRQ?YKf)_(>!ZrJHuQ4F|HTsuUl!HIHRJ6$HmekSiu3>I zh^Q2|ibKP6ntf>htkdkq8OxWjN1fw#u_PUrS*fq4 zrFv(CaGhu=32m;M<%yh(zvuWfL$930pKIm7a}DBFNyVz=QOCH$_%T$>$5RT!l=V^j zxML`mp`?w=#+^oMn$lN_DPiSL7Lke)X8@|XjGu{<6`?@2RSZ*W z!2cGGUM~hKI)-uZREn`wa>TgD(6G7z&M8EYAS(lF1D2D;HUbM~u}#4IS!}bJmq3-w z?ArniwK0Qj1!l`)+kk1qS#HH+@G;OctpF`W1aLl3YPuSogoFBZm|C+ zM#XqcJiEAyjozp&P9Hg&S=85FY@;KMCkQ-A;3)!E3EX>r#BN+C>J0*dz|#c&jKG-q zq&5Pr%9mWl4ZpJFJ_qR*(Zqk(R)FlTTjZG`TkQ4}h?=D}sk3#L*=l)Xn{kcg{3(HP z0<*78wAtFHR@Fc0V8f~JtgK}0QtBs-gEsah@!;y^x%Ls;6~)vn1x=o6p-&ahtS%XL zQqRcssAj}akbuR#me+XNbRyBXkT|>}D$S5`(0dXdk~4{W!0iLwi; 
zcqdWB-uPNFCuZWUWLSJJUQOJYU&Nmy#3lZKjn}>e2D0YKU@%TYQ;BR|A{h5GzZPTc zcOZdV{gTK}lskuDB9JT0yLayO4Q<&sz=lL;qKeqWP~w(*Kh6?0a`!x5@eISDN*vS< z_TdQZ0hAs@Bl@iIXq-Zgms`p(3LZp(nk!8*IINnr;7C1tcJAM`n;k-(A@P}3PTZpS z=oZm-tfTr0>JfGrWgZ+t4vT@+2-OM%)*D9;EL9x&eZ`iX;`Fh8@xigqHW|Zzf~G)K zVfcRpOlBRKXJvBfN8WaX$HkY9$H-3c%JE8K5dU<%%=`dmvM3Sk!E`W_2j_@fQpO1 zeWZp2pagY^BdJPJlCtOWX<;`=Ps3^Eib=w{7j;&E9CPBul-c&^68Ui{|4FLGcpOKN zNx5(%km)5#8#B%??>8Fdh1;B&dh`+@ZNiq`RVp0?lRaAr0FDbWj%RXre*baKyIxK= zxK@E2-WkCYP)z1-ANGX!G`(fRYV3%Ym3JAtfjjy`gbw7MLTW8ia_8}Xs&YP6j8v!4 z&eW8zNAU)PE~$v{4vII5@17~^zgJ4IO%Zk@Y(ky|soZUX)Mn&u0q_=O3*;OLZloGo3vc`lr_lgGsKXHSt+8Si7GguEusox4hy*gm^y z@JBe3T*`W+HXyu;@DqdvFa{5sC=H%qvZ~NUVgV?^p|)o>P{f+aslyMqIisQ827lr3H8?Qn8rd}L4-%GBE%jezVJLikVweyU;DmFfTxd0QLH1X41VM#Y7>>SfaF(UBSYE0eB=PRC9mToX6uVjY8KyV+>g!>`EPV@WB!%I_@9 bN~}8grj#alCl*=Zl?XqVS%p76@DKbCG@+fT diff --git a/deep_autoviml/data_load/classify_features.py b/deep_autoviml/data_load/classify_features.py index da64635..e451e40 100644 --- a/deep_autoviml/data_load/classify_features.py +++ b/deep_autoviml/data_load/classify_features.py @@ -217,7 +217,7 @@ def classify_columns(df_preds, model_options={}, verbose=0): #### If there are 30 chars are more in a discrete_string_var, it is then considered an NLP variable ### if a variable has more than this many chars, it will be treated like a NLP variable - max_nlp_char_size = check_model_options(model_options, "nlp_char_limit", 30) + max_nlp_char_size = check_model_options(model_options, "nlp_char_limit", 50) ### if a variable has more than this limit, it will not be treated like a cat variable # #### Cat_Limit defines the max number of categories a column can have to be called a categorical colum cat_limit = check_model_options(model_options, "variable_cat_limit", 30) @@ -502,7 +502,7 @@ def classify_features_using_pandas(data_sample, target, model_options={}, verbos nlps = [] bools = [] ### if a variable has more than this many chars, it will be treated like a NLP variable - nlp_char_limit = check_model_options(model_options, "nlp_char_limit", 30) + nlp_char_limit = check_model_options(model_options, "nlp_char_limit", 50) ### if a variable has more than this limit, it will not be treated like a cat variable # cat_limit = check_model_options(model_options, "variable_cat_limit", 30) ### Classify features using the previously define function ############# @@ -540,7 +540,7 @@ def classify_features_using_pandas(data_sample, target, model_options={}, verbos floats = [] preds_copy = copy.deepcopy(preds) for key in preds_copy: - if data_sample[key].dtype in ['object'] or str(data_sample[key].dtype) == 'category': + if str(data_sample[key].dtype) in ['object', 'category']: if type('str') in data_sample[key].map(type).value_counts().index: feats_max_min[key]["dtype"] = "string" elif data_sample[key].map(type).value_counts().index[0] == int: @@ -574,7 +574,7 @@ def classify_features_using_pandas(data_sample, target, model_options={}, verbos discrete_strings.remove(key) var_df1['discrete_string_vars'] = copy.deepcopy(discrete_strings) #### This is not a mistake - you have to test it again. 
That way we make sure type is safe - if data_sample[key].dtype in ['object'] or str(data_sample[key].dtype) == 'category': + if str(data_sample[key].dtype) in ['object', 'category']: if data_sample[key].map(type).value_counts().index[0] == object or data_sample[key].map(type).value_counts().index[0] == str: feats_max_min[key]["dtype"] = "string" elif data_sample[key].dtype in ['bool']: @@ -627,10 +627,11 @@ def classify_features_using_pandas(data_sample, target, model_options={}, verbos feats_max_min[key]["vocab"] = vocab feats_max_min[key]['size_of_vocab'] = len(vocab) elif feats_max_min[key]['dtype'] in ['string']: - data_types = len(data_sample[key].fillna("missing").map(type).value_counts()) + data_sample[[key]] = data_sample[[key]].fillna("missing") + data_types = len(data_sample[key].map(type).value_counts()) if data_types > 1: print('\nDATA CLEANING ALERT: Dropping %s since it has %s mixed data types.' %(key, data_types)) - print(' Transform variable to single data type and re-run. Continuing...') + print(' Convert this variable to a single data type and re-run deep_autoviml.') ignore_variables.append(key) preds.remove(key) feats_max_min['predictors_in_train'] = preds @@ -642,7 +643,7 @@ def classify_features_using_pandas(data_sample, target, model_options={}, verbos discrete_strings.remove(key) var_df1['discrete_string_vars'] = copy.deepcopy(discrete_strings) if not key in ignore_variables: - if np.mean(data_sample[key].fillna("missing").map(len)) >= nlp_char_limit: + if np.max(data_sample[key].map(len)) >= nlp_char_limit: ### This is for NLP variables. You want to remove duplicates ##### if key in dates: continue @@ -652,7 +653,7 @@ def classify_features_using_pandas(data_sample, target, model_options={}, verbos elif key in discrete_strings: discrete_strings.remove(key) var_df1['discrete_string_vars'] = discrete_strings - print('%s is detected and will be treated as an NLP variable' %key) + print('%s is detected as an NLP variable' %key) if key not in var_df1['nlp_vars']: var_df1['nlp_vars'].append(key) #### Now let's calculate some statistics on this NLP variable ### @@ -663,14 +664,14 @@ def classify_features_using_pandas(data_sample, target, model_options={}, verbos ### Immediately cap the vocab size to 300,000 - don't measure its vocab!! 
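This hunk changes the NLP detection test from the mean string length to the maximum string length of a column, and the default nlp_char_limit is raised from 30 to 50. A minimal stand-alone sketch of that decision rule, outside the library (the DataFrame, column names and values are purely illustrative):

    import pandas as pd

    df = pd.DataFrame({
        "review_text": ["Great product, fast shipping, exactly as described in the listing.",
                        "Terrible. Arrived broken and support never answered my emails."],
        "color": ["red", "blue"],
    })

    nlp_char_limit = 50   # new default threshold; was 30 (and mean-based) before this patch
    for col in df.select_dtypes(include="object").columns:
        col_clean = df[col].fillna("missing")
        if col_clean.map(len).max() >= nlp_char_limit:   # longest string decides, not the average
            print("%s is detected as an NLP variable" % col)
        else:
            print("%s is treated as a categorical variable" % col)
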
data_sample = data_sample.sample(frac=0.1, random_state=0) try: - vocab = np.concatenate(data_sample[key].fillna('missing').map(tokenize_fast)) + vocab = np.concatenate(data_sample[key].map(tokenize_fast)) except: - vocab = np.concatenate(data_sample[key].fillna('missing').map(tokenize_fast).values) + vocab = np.concatenate(data_sample[key].map(tokenize_fast).values) vocab = np.unique(vocab).tolist() feats_max_min[key]["vocab"] = vocab try: - feats_max_min[key]['seq_length'] = int(data_sample[key].fillna("missing").map(len).max()) - num_words_in_each_row = data_sample[key].fillna("missing").map(lambda x: len(x.split(" "))).mean() + feats_max_min[key]['seq_length'] = int(data_sample[key].map(len).max()) + num_words_in_each_row = data_sample[key].map(lambda x: len(x.split(" "))).mean() feats_max_min[key]['size_of_vocab'] = int(num_rows_in_data*num_words_in_each_row) except: feats_max_min[key]['seq_length'] = len(vocab) // num_rows_in_data @@ -679,10 +680,8 @@ def classify_features_using_pandas(data_sample, target, model_options={}, verbos ### This is for string variables ######## #### Now we select features if they are present in the data set ### num_rows_in_data = model_options['DS_LEN'] - if data_sample[key].isnull().sum() > 0: - vocab = data_sample[key].fillna("missing").unique().tolist() - else: - vocab = data_sample[key].unique().tolist() + data_sample[[key]] = data_sample[[key]].fillna("missing") + vocab = data_sample[key].unique().tolist() vocab = np.unique(vocab).tolist() #vocab = ['missing' if type(x) != str else x for x in vocab] feats_max_min[key]["vocab"] = vocab @@ -748,7 +747,7 @@ def classify_features_using_pandas(data_sample, target, model_options={}, verbos print('Not performing feature crossing for categorical nor integer variables' ) return data_sample, var_df1, feats_max_min ############################################################################################ -def EDA_classify_and_return_cols_by_type(df1, nlp_char_limit=20): +def EDA_classify_and_return_cols_by_type(df1, nlp_char_limit=50): """ EDA stands for Exploratory data analysis. 
This function performs EDA - hence the name ######################################################################################## @@ -763,7 +762,8 @@ def EDA_classify_and_return_cols_by_type(df1, nlp_char_limit=20): nlpcols = [] for each_cat in cats: try: - if df1[each_cat].map(len).mean() >=nlp_char_limit: + df1[[each_cat]] = df1[[each_cat]].fillna('missing') + if df1[each_cat].map(len).max() >=nlp_char_limit: nlpcols.append(each_cat) catcols.remove(each_cat) except: @@ -775,7 +775,7 @@ def EDA_classify_and_return_cols_by_type(df1, nlp_char_limit=20): floatcols = df1.select_dtypes(include='float').columns.tolist() return catcols, int_cats, intcols, floatcols, nlpcols ############################################################################################ -def EDA_classify_features(train, target, idcols, nlp_char_limit=20): +def EDA_classify_features(train, target, idcols, nlp_char_limit=50): ### Test Labeler is a very important dictionary that will help transform test data same as train #### test_labeler = defaultdict(list) @@ -1081,7 +1081,7 @@ def classify_dtypes_using_TF2(data_sample, preds, idcols, verbose=0): """ print_features = False nlps = [] - nlp_char_limit = 30 + nlp_char_limit = 50 all_ints = [] floats = [] cats = [] @@ -1108,7 +1108,8 @@ def classify_dtypes_using_TF2(data_sample, preds, idcols, verbose=0): int_vocab = tf.unique(value)[0].numpy().tolist() feats_max_min[key]['size_of_vocab'] = len(int_vocab) elif feats_max_min[key]['dtype'] in [tf.string]: - if tf.reduce_mean(tf.strings.length(feature_batch[key])).numpy() >= nlp_char_limit: + feature_batch[[key]] = feature_batch[[key]].fillna("missing") + if tf.reduce_max(tf.strings.length(feature_batch[key])).numpy() >= nlp_char_limit: print('%s is detected and will be treated as an NLP variable') nlps.append(key) else: diff --git a/deep_autoviml/data_load/extract.py b/deep_autoviml/data_load/extract.py index 9a2b81d..6c576b2 100644 --- a/deep_autoviml/data_load/extract.py +++ b/deep_autoviml/data_load/extract.py @@ -127,7 +127,7 @@ def transform_train_target(train_target, target, modeltype, model_label, cat_voc train_target = copy.deepcopy(train_target) cat_vocab_dict = copy.deepcopy(cat_vocab_dict) ### Just have to change the target from string to Numeric in entire dataframe! 
### - + if modeltype != 'Regression': if model_label == 'Multi_Label': target_copy = copy.deepcopy(target) @@ -310,34 +310,50 @@ def load_train_data_file(train_datafile, target, keras_options, model_options, v ### if modeltype is given, then do not find the model type using this function _, _, usecols = find_problem_type(train_small, target, model_options, verbose) - label_encode_flag = False + ########## Find small details about the data to help create the right model ### - - if modeltype == 'Classification' or modeltype == 'Multi_Classification': + label_encode_flag = model_options["label_encode_flag"] + if isinstance(label_encode_flag, str): + if modeltype == 'Classification' or modeltype == 'Multi_Classification': + if isinstance(target, str): + #### This is for Single-Label problems ######## + if train_small[target].dtype == 'object' or str(train_small[target].dtype).lower() == 'category': + label_encode_flag = True + elif 0 not in np.unique(train_small[target]): + label_encode_flag = False + if verbose: + print(' label encoding must be done since there is no zero class!') + target_vocab = train_small[target].unique() + num_classes = len(target_vocab) + elif isinstance(target, list): + #### This is for Multi-Label problems ######## + num_classes = [] + for each_target in target: + if train_small[each_target].dtype == 'object' or str(train_small[target[0]].dtype).lower() == 'category': + label_encode_flag = True + elif 0 not in np.unique(train_small[each_target]): + label_encode_flag = False + if verbose: + print(' label encoding must be done since there is no zero class!') + target_vocab = train_small[each_target].unique().tolist() + num_classes.append(len(target_vocab)) + else: + num_classes = 1 + target_vocab = [] + label_encode_flag = False + else: if isinstance(target, str): - #### This is for Single-Label problems ######## - if train_small[target].dtype == 'object' or str(train_small[target].dtype).lower() == 'category': - label_encode_flag = True - elif 0 not in np.unique(train_small[target]): - label_encode_flag = True ### label encoding must be done since no zero class! target_vocab = train_small[target].unique() num_classes = len(target_vocab) - elif isinstance(target, list): - #### This is for Multi-Label problems ######## - num_classes = [] - for each_target in target: - if train_small[each_target].dtype == 'object' or str(train_small[target[0]].dtype).lower() == 'category': - label_encode_flag = True - elif 0 not in np.unique(train_small[each_target]): - label_encode_flag = True - target_vocab = train_small[each_target].unique().tolist() - num_classes.append(len(target_vocab)) - else: - num_classes = 1 - target_vocab = [] + else: + for each_target in copy_target: + target_vocab = train_small[target].unique().tolist() + num_classes_each = len(target_vocab) + num_classes.append(int(num_classes_each)) + #### This is where we set the model_options for num_classes and num_labels ######### model_options['num_classes'] = num_classes - + ############# Sample Data classifying features into variaous types ################## print('Loaded a small data sample of size = %s into pandas dataframe to analyze...' 
%(train_small.shape,)) ### classify variables using the small dataframe ## @@ -727,37 +743,49 @@ def load_train_data_frame(train_dataframe, target, keras_options, model_options, cat_vocab_dict['modeltype'] = modeltype model_options['batch_size'] = batch_size ########## Find small details about the data to help create the right model ### - target_transformed = False - if modeltype != 'Regression': - if isinstance(target, str): - #### This is for Single Label Problems ###### - if train_small[target].dtype == 'object' or str(train_small[target].dtype).lower() == 'category': - target_transformed = True - target_vocab = train_small[target].unique() - num_classes = len(target_vocab) - else: - if 0 not in np.unique(train_small[target]): - target_transformed = True ### label encoding must be done since no zero class! - target_vocab = train_small[target].unique() - num_classes = len(train_small[target].value_counts()) - elif isinstance(target, list): - #### This is for Multi-Label Problems ####### - copy_target = copy.deepcopy(target) - num_classes = [] - for each_target in copy_target: - if train_small[target[0]].dtype == 'object' or str(train_small[target[0]].dtype).lower() == 'category': + target_transformed = model_options["label_encode_flag"] + if isinstance(target_transformed, str): + if modeltype != 'Regression': + if isinstance(target, str): + #### This is for Single Label Problems ###### + if train_small[target].dtype == 'object' or str(train_small[target].dtype).lower() == 'category': target_transformed = True - target_vocab = train_small[target].unique().tolist() - num_classes_each = len(target_vocab) + target_vocab = train_small[target].unique() + num_classes = len(target_vocab) else: - if 0 not in np.unique(train_small[target[0]]): + if 0 not in np.unique(train_small[target]): target_transformed = True ### label encoding must be done since no zero class! - target_vocab = train_small[target[0]].unique() - num_classes_each = train_small[target].apply(np.unique).apply(len).max() - num_classes.append(int(num_classes_each)) + target_vocab = train_small[target].unique() + num_classes = len(train_small[target].value_counts()) + elif isinstance(target, list): + #### This is for Multi-Label Problems ####### + copy_target = copy.deepcopy(target) + num_classes = [] + for each_target in copy_target: + if train_small[target[0]].dtype == 'object' or str(train_small[target[0]].dtype).lower() == 'category': + target_transformed = True + target_vocab = train_small[target].unique().tolist() + num_classes_each = len(target_vocab) + else: + if 0 not in np.unique(train_small[target[0]]): + target_transformed = True ### label encoding must be done since no zero class! 
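The "no zero class" check above matters because Keras' sparse categorical losses expect integer labels in the range 0 to num_classes-1, so a target coded as {1, 2, 3} (or as strings) has to be re-indexed before training; here the code only records that fact in target_transformed, and the actual mapping is handled by the library's target transformer (transform_train_target shown earlier in extract.py). A stand-alone sketch of the idea, not the library's own encoder:

    import pandas as pd

    y = pd.Series([1, 3, 3, 2, 1])                       # classes {1, 2, 3}: no zero class
    classes = sorted(y.unique())                         # [1, 2, 3]
    to_index = {c: i for i, c in enumerate(classes)}     # {1: 0, 2: 1, 3: 2}
    y_encoded = y.map(to_index)                          # labels now run from 0 to num_classes - 1
    from_index = {i: c for c, i in to_index.items()}     # kept so predictions can be decoded later
    print(y_encoded.tolist(), from_index)
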
+ target_vocab = train_small[target[0]].unique() + num_classes_each = train_small[target].apply(np.unique).apply(len).max() + num_classes.append(int(num_classes_each)) + else: + num_classes = 1 + target_vocab = [] + target_transformed = False else: - num_classes = 1 - target_vocab = [] + if isinstance(target, str): + target_vocab = train_small[target].unique() + num_classes = len(target_vocab) + else: + for each_target in copy_target: + target_vocab = train_small[target].unique().tolist() + num_classes_each = len(target_vocab) + num_classes.append(int(num_classes_each)) + ########### find the number of labels in data #### if isinstance(target, str): num_labels = 1 @@ -772,7 +800,7 @@ def load_train_data_frame(train_dataframe, target, keras_options, model_options, cat_vocab_dict['num_labels'] = num_labels cat_vocab_dict['num_classes'] = num_classes cat_vocab_dict["target_transformed"] = target_transformed - + #### once the dataframe has been classified, you can again change train_small to original dataframe ## train_small = copy.deepcopy(train_dataframe) @@ -1054,18 +1082,23 @@ def combine_nlp_text(features): y[NLP_COLUMN] = tf.strings.reduce_join([features[i] for i in NLP_VARS],axis=0, keepdims=False, separator=' ') return y - ################################################################ + ###################################################################################### ### You have to load only the NLP or text variables into dataset. ### otherwise, it will fail during predict. Yo still need to create input for them. ### In mixed_NLP models, you drop original NLP vars and combine them into one NLP var. - if NLP_VARS and keras_model_type.lower() in ['nlp','text']: + ###################################################################################### + + if NLP_VARS and keras_model_type.lower() in ['nlp','text', 'mixed_nlp', 'combined_nlp']: if keras_model_type.lower() in ['nlp', 'text']: train_ds = train_ds.map(lambda x, y: (process_NLP_features(x), y)) #train_ds = train_ds.unbatch().batch(batch_size) print(' processed NLP or text vars: %s successfully' %NLP_VARS) - else: + elif keras_model_type.lower() in ['combined_nlp']: train_ds = train_ds.map(lambda x, y: (combine_nlp_text(x), y)) print(' combined NLP or text vars: %s into a single feature successfully' %NLP_VARS) + else: + ### Mixed NLP is to keep NLP vars separate so they can be processed individually ## + print(' keeping NLP vars separate') else: print(' No special text preprocessing done for NLP vars.') ############################################################################ diff --git a/deep_autoviml/deep_autoviml.py b/deep_autoviml/deep_autoviml.py index f5261b1..3b650f3 100644 --- a/deep_autoviml/deep_autoviml.py +++ b/deep_autoviml/deep_autoviml.py @@ -193,6 +193,11 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep Another option would be to inform autoviml about encoding in CSV file for it to read such as 'latin-1' by setting {"csv_encoding": 'latin-1'} Other examples: + "nlp_char_limit": default 50. Beyond this max limit of chars in column, it + will be considered NLP column and treated as such. + "variable_cat_limit": default 30. if a variable has more than this limit, it + will NOT be treated as a categorical variable. + "DS_LEN": default "". Number of rows in dataset. You can leave it "" to calculate automatically. "csv_encoding": default='utf-8'. You can change to 'latin-1', 'iso-8859-1', 'cp1252', etc. 
"cat_feat_cross_flag": if you want to cross categorical features such as A*B, B*C... "sep" : default = "," comma but you can override it. Separator used in read_csv. @@ -205,6 +210,9 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep We will figure out single label or multi-label problem based on your target being string or list. "header": default = 0 ### this is the header row for pandas to read + "compression": None => you can set it to zip or other file compression formats if your data is compressed + "csv_encoding": default 'utf-8'. But you can set it to any other csv encoding format your data is in + "label_encode_flag": False. But you can set it to True if you want it encoded. "max_trials": default = 30 ## number of Storm Tuner trials ### Lower this for faster processing. "tuner": default = 'storm' ## Storm Tuner is the default tuner. Optuna is the other option. "embedding_size": default = 50 ## this is the NLP embedding size minimum @@ -260,7 +268,7 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep print('Model and logs being saved in %s' %save_model_path) if keras_model_type.lower() in ['image', 'images', "image_classification"]: - ############### Now do special image processing here ################################### + ############### Now do special IMAGE processing here ################################### if 'image_directory' in model_options.keys(): print(' Image directory given as %s' %model_options['image_directory']) image_dir = model_options["image_directory"] @@ -287,7 +295,7 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep print(deep_model.summary()) return deep_model, cat_vocab_dict elif keras_model_type.lower() in ['text', 'text classification', "text_classification"]: - ############### Now do special image processing here ################################### + ############### Now do special TEXT processing here ################################### text_alt = True ### This means you use the text directory option if 'text_directory' in model_options.keys(): print(' text directory given as %s' %model_options['text_directory']) @@ -363,8 +371,8 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep print(' %s : %s' %(key, keras_options_copy[key])) keras_options[key] = keras_options_copy[key] - list_of_model_options = ["idcols","modeltype","sep","cat_feat_cross_flag", "model_use_case", - "nlp_char_limit", "variable_cat_limit", "csv_encoding", "header", + list_of_model_options = ["idcols","modeltype","sep","cat_feat_cross_flag", "model_use_case", "label_encode_flag", + "nlp_char_limit", "variable_cat_limit", "compression", "csv_encoding", "header", "max_trials","tuner", "embedding_size", "tf_hub_model", "image_directory", 'image_height', 'image_width', "image_channels", "save_model_path"] @@ -378,6 +386,8 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep model_options_defaults["nlp_char_limit"] = 30 model_options_defaults["variable_cat_limit"] = 30 model_options_defaults["csv_encoding"] = 'utf-8' + model_options_defaults['compression'] = None ## is is needed in case to read Zip files + model_options_defaults["label_encode_flag"] = '' ## User can set it to True or False depending on their need. 
model_options_defaults["header"] = 0 ### this is the header row for pandas to read model_options_defaults["max_trials"] = 30 ## number of Storm Tuner trials ### model_options_defaults['tuner'] = 'storm' ## Storm Tuner is the default tuner. Optuna is the other option. @@ -446,7 +456,7 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep #### There may be other use cases for model_use_case in future hence leave this empty for now # #### you must create a functional model here - print('\nCreating a new Functional model here...') + print('\nCreating a new Functional keras model now...') print(''' ################################################################################# ########### C R E A T I N G A K E R A S M O D E L ############ @@ -483,7 +493,7 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep keras_options, model_options, var_df, cat_vocab_dict, project_name, save_model_flag, verbose) else: #### This is used only for custom auto models and is out of the strategy scope ####### - print('Building and training an automatic model using %s Tuner...' %model_options['tuner']) + print('Building and training a(n) %s model using %s Tuner...' %(keras_model_type, model_options['tuner'])) deep_model, cat_vocab_dict = train_custom_model(nlp_inputs, meta_inputs, meta_outputs, nlp_outputs, batched_data, target, keras_model_type, keras_options, model_options, var_df, cat_vocab_dict, project_name, @@ -509,6 +519,6 @@ def fit(train_data_or_file, target, keras_model_type="basic", project_name="deep ############################################################################################ def get_save_folder(save_dir): - run_id = time.strftime("model_%Y_%m_%d-%H_%M_%S") + run_id = time.strftime("model_%Y_%m_%d_%H_%M_%S") return os.path.join(save_dir, run_id) ############################################################################################ \ No newline at end of file diff --git a/deep_autoviml/modeling/__pycache__/create_model.cpython-38.pyc b/deep_autoviml/modeling/__pycache__/create_model.cpython-38.pyc index 6c8996f61c3aeff6239dae1a2c77b3ed4c0354be..a3260f57ac2a28611eac4956febceca7ed905fce 100644 GIT binary patch delta 1216 zcmZ{jO>7fK6vy}N+K%nUi4!|suibnRC>ZLPgea68D%$X+76?!X5I)?Pv5jN1(DA4W z(pX4D>Y)7^3TL*+I!sN%+l;1r2NENJ9d(SzORSq z-R;mxMHktCqFWpED{@ld6{qNwx~sE|s>s7O$+3~+qnLBkt*n)t<}DFz4nSirZZLgL zue!^fV%3UUQ_YOiGmJeRI!exnV%Gqi6*s%aAt&BO%oY#bfJ*1Y>#ii67rl{0D&6e0 zgZ%T#7-qk1lB(}JXN7zRDz?xPD*j^j8|~z*xE1L}<)z3tR8iS-AAU!-Tjb2PUDHh^ zJI12(DZYpA&Fc%GTr$nv|6E4U#Y>+k7rE|bb0^#vzUTzJQmgmUyYITR@8^5DD{nZO z(_%Rqhm2T_nxPe}#KU!VOyh2HL6}xzGa*@$yMY~`p|gfn@9(Rx{3Y&dR9v&#(|Nh$ z{GdD$ti8y!hT=)h{uDHMAuX)?G+w%teCclJd6>gzJ+0aLsSo))ljoOOKR^2bs?8`z zP5HgQHh<&Hed|49E!1sJG*`fbq*!!S{gj|mN!*OZ;Zp5h><4$S?jqUjl{el@A>Suw zVyG_+Z^XX7#iO)_-&;bbJ{+d}<-(;arEHqmcEX?^A)>#_@SChLrri0$w9RO%M2ia1 z2XY>eL4Lca^Psk;e+6KvwrOD8z@hNTaov<2b$xbw-F<;xE|=?&uEQ+-Q@q^r8ro`4 z62~BJ_(9j!uQiw63V!foq#d;#+g6~b6VN1tq3*S_WtOFvzfS;_6kR*TP%<2mH+4@Y z%cj#q&NxgTiZjXG@Lb$Z#^HtdBe|tliqNnOaTz|A;V1bBF*&ouT5<%Qijl#c&?8EN Whv0GT;oy?t>5$nTadr4-IPf2Dh)*m4 delta 1216 zcmZ{je`s4(6vy9rFG-VU(q?wa%S)PewXRz%ZCqD2jj&aVtZeOmuj}HEG~G?pAKPnp zbCEi35LW0II0#(uFVRL}8~bN$8Pf^w50#<+_}>&CLni*?f0DoaN6>R`lERqgk#loD z=bZ03IrqK?bJe-fsZhw&@N@o?2l<<~Ly2%Ch`TZTmKZv?pMb5EP4U|nyEYYm-S8KK z@K@G=#FzAnhk7`aU*(>%!7LVAwD~R)A`_zCnTAQxyEO%e#i6Y` zKKPYu1%q$TWS||J7G_^Jk6Y8%<<7-az-pi`XDPrWDI5nSXP^lcB9f;qQ@G9x5cED6r<5# z`khw36XyHM8H{+hMJj;{%`W+FRBfa4sCwJ#H9AOEd>T!n`q$Ah_z=}?@4=_E2OuZj 
zV`g{h5VJ%YPd}U!g&8LE!K=ZF%^7QoHh&i%U2w}HJo=yyX$yZ|YCU~%2C_p{B0J#9 zjuUGX7s?n0TuP|hN=#1*$24Poi9s!b>EjjO9d#%{?F*a?EQ;dmXW*uACd#BG;M$vK zB0roz6!LNAPb79M$S0vs2kY|Vc?FnH83b|YHYlIiOv6xyJ_&^jiWSI=z+Iur^4K|8 zBR6?^%?iCK_qS9wZB_$&AW;B7>ob3%{t{CE4uK9pb;7OB zQrJ7;TbQ4ojTjvpQG5@ACIox=c>g~6G=HSOdUpvD$`O1SRa7HZ3IM(^t4P4RpLpv0 zRKSNH{e1CTzv};;oO65TV$$eYa3%V?6RYxdt`9yhyKe0`=}>aS+6bl)6RTw9{iizX zBJj;{F65&fI0jy$8n?4%apn3i*nSZK-o(OZ@>fokjH)VxkGKI+xra(8Af2sg!a2I=4q3v>>}W_i=;9Z_6L3je79D!i0> zuNVvbI#T08-Wv>3H;#)pZE-X4YTmf*=*aNN;gP{EiW7^cM10fK2e-F#D5qiHB^2I9 zeG#8FL3}gbrJy$9r{0BG8b-Fbsm=myd)&O>)n?=7!2Dd8c4PlvqQF}Sy14mFT?brf zNbyY2zeK|mN#9Sz4dHH(i>Kl!m%YIypNQ4FZ@(9LEe8kBMc~Ay3qb|~;gG@&g$G{c|9z&ydJs8#dDU>4g>HjRd}z2XZ9fv55gg^K z!^QcB5Z#Ah48b@8af8~)zc$=ay%SAxUV>eUh}*~caD>(&&3f5OC&&!lh1TrgJ4VW_ zBD#+AGb3d?;D2Kz)irgBD4}!P4Swu<8lLk z@a&Xz7726wH)jJUjN5$d`ZN$#6WqPh`>_fC=Od|hvC!i40qsG8W9wcPZ;GQR2>-|; z(FlUe{IlnF<2DfbZ^}D;iRSaS&yC2}_>s|?TntS$RVYU1Hj)eF_l*`> zUxlub82{vGIe&HZ9*bgz|1{wP=MTs({=oVBBet8L=vEn%yWi#D9A+zxqWqS}1rEh;U0jad%RF3M6uQMQ=&g-jK=y~gkhnPn!AUP-`?d#9Kr@~; zJs%8WW<;l*1PXiv5?^v41w*d!Vi!y}#((RGgmV>}BE`Rjn22+{Tc;F&TCDlPk$E~3 woafOgcfN5ij@(6Dkv90xf4={6{{M_WzwwWaz|Dq+I)a~#kw4%2@W!tK&uXQ)I^&v_q`>mc~LVWxF8IHRsw~_DAC)wFIStZGwWFt+F7jZ(j%&$3yKcbi4+m; zXMPB`~(gRWC>99C^Ss%)stY7cq z?gATtPUx!ojgFX&5Ghl=+ft6RK~$E}m7xgHdvG4cxfkbpoLg`fv7NDE`vaOCv0`gs zqv%EpZ-QRvffk5B1H`~zta(e)`}BTim&iy#(wcd&BxzpgV`CA5<9LK@OZtFX=g1W0 zySy3tz>Sf|?ad`AKZ)ihggLbdrAZsNV@7!@LXho4GW0~eUV2rug0=sbN(uZNhM)eWY zUxcuuKd?&f$oe>93GRaNA~ieN#WH0}5n^kUT>EPqyerpj%50%5%=Vz%Ys)}#eSp-^ zhFKUuW0wxS`UH|)hKXFiIZ$>;4*Ib;mwAN8%|Wq(&JD2@E=V8_yLnG&5CPK_{n;so;@Bf&6Q(OZuE81V5gK4&MUqZM3gl$(ZW87zP+G!%na9l!`ILDU<{eL85uR2YMK>1kcLC}!PEX0B%^mppG>nRp zxJA@fig^(EVBZpmom<2^s`xdlGBJN?$x8~i$#t?xfS5Bdislx%1d_PukoYMyvv1#b zq4XXs=4RPCd-ag&3AG>HImceZXgt~zSTxUL#m#YX9VApAKF^m4O22n@d$j;x;2x1n zE1OsKi*V7(g-81W3X5}T^H1tMi*XYGq}Ei2T-HMU%KA< zG8C(zy;=pgpkKI&8158J85G(Spr5V?f&U7RmhYQ{JUPPnjkjL(9xot-WfWzu5=2M(8gBK-9xu^mnf%h=1}Ek7I$!3x=x1E$A4`ZlL{xn zo#Z`bXG>92yG+?i?x88vn-q zOZ=BR=rleJg^#DB2}+ra3Z?U|*rztF(;@Mp_KQ6za@V7NpC_Tz@^w-*$RGFOAcs89( zVZ@GdQhPjpe5*4ip-cJhS6WW$vY3lJPhvh!TzRf(dE>M0QMzq>+1*XK(I8Jwyn{ZS zcv8t!Mmh1J-L%3}8BMp){N7HMpGfN7iC05|yAQO-#=iWq5$X94Ju=#Qzeh912fd>- zXZ)_Wk3KM-_5Ozb!uY7~=k$k0qJLxT2}PQEpX8%{>+7T9>$iRZOLpT^ls4fQFn-(r zvCHG8#?65tPqXBuJ_81RNuL=b@mT#QNTfX#!_SP@<4g2Yo_f9a0l*)J!c~nybmFPJ~3b;&ozj=D&-<+g(Mg>mTJadsLO?3}n25u^7?q78K+U3yTEZ;GN}3S^OoC#rt0oJ( z)DmYEPLVJ(;@It`_=!MHQw%^|HfU0$z$ZkH(G9&cC8)a=Gy<_sQMAI1)3$D}$do+u zMiQ~~MVXPpn#J0KwFzqotIDi@*H`BHJAfOoa|?+<_AsDqMJK3{v%sJ?^0R} zMH%RDlx=H9rqJl}9Q-ZWN@=P?l6?+(S=$c)c|;)>TEUGvawk3TV^LPt^m*z7%D9aS z^lcBY+LWy53Y9Ym0Wd*5l%mj<$E}bi&Y&G#)hdWdG4&ENve2Bi)8SUI!mKtKs&Fez zSyZQ=s>8?2yRXR@e?70E1CVbs*g#D%yE=g5pEI9jD~w-x=wsGfWj zpa$}mds&j~@OaSQ;P2{s%+(9e+N>#V0 zPDY7~wFooL$RKIz(B9Oa(u_%CYMU9;02`t+6(vnVI%AxrX=@lx&eBwrStAZSYFKw$ zh|B8u)3iGF#w<;WTqt_R#A zE##-gn)pufkQ=!|FWk+D2NA7Te}WdQvly^h8G(7O;K%Q|3Iq3H&HO_-FFmSOcC{`W zi?nFx>eVe;pv5S2zz=y1C28lK`=8w+oNTEYsj7mL3IcE~=`F9KJ)9<($`g8%Mhbbqnh``GT&mEard(^bDPk+VP zSC#piP8d7i{krABbsJe(ejpcUj8fteaRtalhBA6^cTGXzQ8d7(e|n!nbwB(~Qgpww z*8Qv0>xAhZEloV8<^kbcQ!CYr2Glu-)sMB+IW@6zy?)P)tsYtDXo7O2kzYJ+{v?>m zUWe)uOm5&Bt=Ins?g>cFV|^0qa&?T%Pl98cPGeT&GB8$X36ECtB7hbAUJx1a)FBcr z^TKaje||lA(%)enbQi1sDX46r{i8BJxC(za;cra_&3s$x!mS895tm=ZMN{HwTk6I} z7oGUjTAfB?g_f=JG=Yv?F+XaH@Pzr#wsfW%cfz{h?7ghfMKws6|0R`;tv?!y?!d0P zhj9dQH7lvf)%$Jj38-n8a8^_+`VRFyTA9`jy%>1d&fR~ zy&#_dzW$>5^VFTD#-070!oFNDY;2k{Jzw%eb29xyzHR{IT}@zJ5^&FDY_ak)J8@pdt?| Naz&BP%#Q}&{tCzI#gPC2 diff --git a/deep_autoviml/modeling/create_model.py 
b/deep_autoviml/modeling/create_model.py index 42de2c0..eb1d59b 100644 --- a/deep_autoviml/modeling/create_model.py +++ b/deep_autoviml/modeling/create_model.py @@ -228,7 +228,7 @@ def create_model(use_my_model, nlp_inputs, meta_inputs, meta_outputs, nlp_output fast_models2 = ['deep_and_cross', 'deep_cross', 'deep cross', 'fast2'] nlp_models = ['bert', 'use', 'text', 'mixed_nlp'] #### The Deep and Wide Model is a bit more complicated. So it needs some changes in inputs! ###### - prebuilt_models = ['basic', 'simple', 'default','dnn','reg_dnn', + prebuilt_models = ['basic', 'simple', 'default','dnn','reg_dnn', 'deep', 'big deep', 'dnn_drop', 'big_deep', 'giant_deep', 'giant deep', 'cnn1', 'cnn','cnn2'] ###### Just do a simple check for auto models here #################### @@ -269,10 +269,10 @@ def create_model(use_my_model, nlp_inputs, meta_inputs, meta_outputs, nlp_output elif keras_model_type.lower() in ['dnn', 'simple_dnn']: ########## Now that we have setup the layers correctly, we can build some more hidden layers model_body = dnn.model - elif keras_model_type.lower() in ['dnn_drop', 'big_deep']: + elif keras_model_type.lower() in ['dnn_drop', 'big_deep', 'big deep']: #################################################### model_body = dnn_drop.model - elif keras_model_type.lower() in ['giant', 'giant_deep']: + elif keras_model_type.lower() in ['giant', 'giant_deep', 'giant deep']: #################################################### model_body = giant_deep.model elif keras_model_type.lower() in ['cnn', 'cnn1','cnn2']: @@ -442,6 +442,7 @@ def create_model(use_my_model, nlp_inputs, meta_inputs, meta_outputs, nlp_output #### This final outputs is the one that is taken into final dense layer and compiled print(' %s model loaded successfully. Now compiling model...' %keras_model_type) ############# You need to compile the non-auto models here ############### + model_body = get_compiled_model(all_inputs, model_body, output_activation, num_predicts, modeltype, optimizer, val_loss, val_metrics, cols_len, targets) print(' %s model loaded and compiled successfully...' 
%keras_model_type) diff --git a/deep_autoviml/modeling/train_custom_model.py b/deep_autoviml/modeling/train_custom_model.py index 6a2361b..a36f9a1 100644 --- a/deep_autoviml/modeling/train_custom_model.py +++ b/deep_autoviml/modeling/train_custom_model.py @@ -52,6 +52,7 @@ def set_seed(seed=31415): from tensorflow.keras.layers import BatchNormalization from tensorflow.keras.optimizers import SGD from tensorflow.keras import regularizers +from tensorflow.keras.layers import LeakyReLU ##################################################################################### from deep_autoviml.modeling.create_model import return_optimizer from deep_autoviml.utilities.utilities import get_model_defaults, get_compiled_model @@ -150,17 +151,17 @@ def build_model_optuna(trial, inputs, meta_outputs, output_activation, num_predi #K.clear_session() #reset_keras() #tf.keras.backend.reset_uids() - - n_layers = trial.suggest_int("n_layers", 1, 4) + ### Keep the number of layers slightly higher to increase model complexity ## + n_layers = trial.suggest_int("n_layers", 2, 8) #num_hidden = trial.suggest_categorical("n_units", [32, 48, 64, 96, 128]) num_hidden = trial.suggest_categorical("n_units", [50, 100, 150, 200, 250, 300, 350, 400, 450, 500]) #weight_decay = trial.suggest_float("weight_decay", 1e-8, 1e-3, log=True) - weight_decay = trial.suggest_float("weight_decay", 1e-8, 1e-7,1e-6, 1e-5,1e-4, 1e-3,1e-2, 1e-1) + weight_decay = trial.suggest_float("weight_decay", 1e-8, 1e-7,1e-6, 1e-5,1e-4, 1e-3) use_bias = trial.suggest_categorical("use_bias", [True, False]) batch_norm = trial.suggest_categorical("batch_norm", [True, False]) add_noise = trial.suggest_categorical("add_noise", [True, False]) - dropout = trial.suggest_float("dropout", 0, 0.5) - activation_fn = trial.suggest_categorical("activation", ['relu', 'tanh', 'elu', 'selu']) + dropout = trial.suggest_float("dropout", 0.5, 0.9) + activation_fn = trial.suggest_categorical("activation", ['relu', 'elu', 'selu']) kernel_initializer = trial.suggest_categorical("kernel_initializer", ['glorot_uniform','he_normal','lecun_normal','he_uniform']) kernel_size = num_hidden @@ -183,7 +184,7 @@ def build_model_optuna(trial, inputs, meta_outputs, output_activation, num_predi model.add(BatchNormalization(name="opt_batchnorm_"+str(i))) if add_noise: - model.add(GaussianNoise(trial.suggest_float("adam_learning_rate", 1e-5, 1e-1, log=True))) + model.add(GaussianNoise(trial.suggest_float("adam_learning_rate", 1e-7, 1e-3, log=True))) model.add(Dropout(dropout, name="opt_drop_"+str(i))) @@ -198,13 +199,13 @@ def build_model_optuna(trial, inputs, meta_outputs, output_activation, num_predi else: optimizer_selected = trial.suggest_categorical("optimizer", optimizer_options) if optimizer_selected == "Adam": - kwargs["learning_rate"] = trial.suggest_float("adam_learning_rate", 1e-5, 1e-1, log=True) + kwargs["learning_rate"] = trial.suggest_float("adam_learning_rate", 1e-7, 1e-3, log=True) kwargs["epsilon"] = trial.suggest_float( "adam_epsilon", 1e-14, 1e-4, log=True ) elif optimizer_selected == "SGD": kwargs["learning_rate"] = trial.suggest_float( - "sgd_opt_learning_rate", 1e-5, 1e-2, log=True + "sgd_opt_learning_rate", 1e-7, 1e-3, log=True ) kwargs["momentum"] = trial.suggest_float("sgd_opt_momentum", 0.8, 0.95) @@ -224,27 +225,27 @@ def build_model_optuna(trial, inputs, meta_outputs, output_activation, num_predi def build_model_storm(hp, *args): #### Before every sequential model definition you need to clear the Keras backend ## keras.backend.clear_session() - + ###### we 
need to use the batch_size in a few small sizes #### if len(args) == 2: batch_limit, batch_nums = args[0], args[1] - batch_size = hp.Param('batch_size', [32, 48, 64, 96, 128, 256], + batch_size = hp.Param('batch_size', [32, 64, 128, 256, 512, 1024, 2048], ordered=True) elif len(args) == 1: batch_size = args[0] - hp.Param('batch_size', [batch_size]) + batch_size = hp.Param('batch_size', [batch_size]) else: - hp.Param('batch_size', [32]) + batch_size = hp.Param('batch_size', [64]) num_layers = hp.Param('num_layers', [1, 2, 3], ordered=True) ##### Now let us build the model body ############### model_body = Sequential([]) # example of model-wide unordered categorical parameter - activation_fn = hp.Param('activation', ['tanh','relu', 'selu', 'elu']) + activation_fn = hp.Param('activation', ['relu', 'selu', 'elu']) use_bias = hp.Param('use_bias', [True, False]) #weight_decay = hp.Param("weight_decay", np.logspace(-8, -3)) - weight_decay = hp.Param("weight_decay", [1e-8, 1e-7,1e-6, 1e-5,1e-4, 1e-3,1e-2, 1e-1]) + weight_decay = hp.Param("weight_decay", [1e-8, 1e-7,1e-6, 1e-5,1e-4]) batch_norm = hp.Param("batch_norm", [True, False]) kernel_initializer = hp.Param("kernel_initializer", @@ -275,14 +276,14 @@ def build_model_storm(hp, *args): # this param will not affect the configuration hash, if this block of code isn't executed # this is to ensure we do not test configurations that are functionally the same # but have different values for unused parameters - model_body.add(Dropout(hp.Param('dropout_value', [0.1, 0.2, 0.3, 0.4, 0.5], ordered=True), + model_body.add(Dropout(hp.Param('dropout_value', [0.5, 0.6, 0.7, 0.8, 0.9], ordered=True), name="dropout_0")) kernel_size = hp.values['kernel_size_' + str(0)] if dropout_flag: dropout_value = hp.values['dropout_value'] else: - dropout_value = 0.00 + dropout_value = 0.5 batch_norm_flag = hp.values['use_batch_norm'] # example of inline ordered parameter num_copy = copy.deepcopy(num_layers) @@ -367,10 +368,12 @@ def run_trial(self, trial, *args): save_model_architecture(comp_model, project_name, keras_model_type, cat_vocab_dict, model_options, chart_name="model_before") #print(' Custom model compiled successfully. 
Training model next...') + batch_numbers = [32, 64, 128, 256, 512, 1024, 2048, 4096] shuffle_size = 1000 - batch_sizes = np.linspace(8, batch_limit,batch_nums).astype(int).tolist() - batch_size = hp.Param('batch_size', batch_sizes, ordered=True) - #print('storm batch size = %s' %batch_size) + batch_sizes = batch_numbers[:batch_nums] + #print('storm batch sizes = %s' %batch_sizes) + batch_size = np.random.choice(batch_sizes) + #print(' selected batch size = %s' %batch_size) train_ds = train_ds.unbatch().batch(batch_size) train_ds = train_ds.shuffle(shuffle_size, reshuffle_each_iteration=False, seed=42).prefetch(batch_size)#.repeat(5) @@ -421,22 +424,23 @@ def return_optimizer_trials(hp, hpq_optimizer): nadam = keras.optimizers.Nadam(lr=0.001, beta_1=0.9, beta_2=0.999) best_optimizer = '' ############################################################################# + lr_list = [1e-2, 1e-3, 1e-4] if hpq_optimizer.lower() in ['adam']: - best_optimizer = tf.keras.optimizers.Adam(lr=hp.Param('init_lr', [1e-2, 1e-3, 1e-4]), + best_optimizer = tf.keras.optimizers.Adam(lr=hp.Param('init_lr', lr_list), epsilon=hp.Param('epsilon', [1e-6, 1e-8, 1e-10, 1e-12, 1e-14], ordered=True)) elif hpq_optimizer.lower() in ['sgd']: - best_optimizer = keras.optimizers.SGD(lr=hp.Param('init_lr', [1e-2, 1e-3, 1e-4]), + best_optimizer = keras.optimizers.SGD(lr=hp.Param('init_lr', lr_list), momentum=0.9) elif hpq_optimizer.lower() in ['nadam']: - best_optimizer = keras.optimizers.Nadam(lr=hp.Param('init_lr', [1e-2, 1e-3, 1e-4]), + best_optimizer = keras.optimizers.Nadam(lr=hp.Param('init_lr', lr_list), beta_1=0.9, beta_2=0.999) elif hpq_optimizer.lower() in ['adamax']: - best_optimizer = keras.optimizers.Adamax(lr=hp.Param('init_lr', [1e-2, 1e-3, 1e-4]), + best_optimizer = keras.optimizers.Adamax(lr=hp.Param('init_lr', lr_list), beta_1=0.9, beta_2=0.999) elif hpq_optimizer.lower() in ['adagrad']: - best_optimizer = keras.optimizers.Adagrad(lr=hp.Param('init_lr', [1e-2, 1e-3, 1e-4])) + best_optimizer = keras.optimizers.Adagrad(lr=hp.Param('init_lr', lr_list)) elif hpq_optimizer.lower() in ['rmsprop']: - best_optimizer = keras.optimizers.RMSprop(lr=hp.Param('init_lr', [1e-2, 1e-3, 1e-4]), + best_optimizer = keras.optimizers.RMSprop(lr=hp.Param('init_lr', lr_list), rho=0.9) elif hpq_optimizer.lower() in ['nesterov']: best_optimizer = keras.optimizers.SGD(lr=0.001, momentum=0.9, nesterov=True) @@ -480,6 +484,10 @@ def train_custom_model(nlp_inputs, meta_inputs, meta_outputs, nlp_outputs, full_ data_size = check_keras_options(keras_options, 'data_size', 10000) batch_size = check_keras_options(keras_options, 'batchsize', 64) class_weights = check_keras_options(keras_options, 'class_weight', {}) + if not isinstance(model_options["label_encode_flag"], str): + if not model_options["label_encode_flag"]: + print(' removing class weights since label_encode_flag is set to False which means classes can be anything.') + class_weights = {} print(' Class weights: %s' %class_weights) num_classes = model_options["num_classes"] num_labels = model_options["num_labels"] @@ -503,7 +511,7 @@ def train_custom_model(nlp_inputs, meta_inputs, meta_outputs, nlp_outputs, full_ if keras_options['lr_scheduler'] in ['expo', 'ExponentialDecay', 'exponentialdecay']: print(' chosen ExponentialDecay learning rate scheduler') expo_steps = (NUMBER_OF_EPOCHS*data_size)//batch_size - learning_rate = keras.optimizers.schedules.ExponentialDecay(0.01, expo_steps, 0.1) + learning_rate = keras.optimizers.schedules.ExponentialDecay(0.0001, expo_steps, 0.1) else: 
learning_rate = check_keras_options(keras_options, "learning_rate", 5e-2) #### The steps are actually not needed but remove them later.### @@ -542,10 +550,21 @@ def train_custom_model(nlp_inputs, meta_inputs, meta_outputs, nlp_outputs, full_ val_loss, num_predicts, output_activation)) #### just use modeltype for printing that's all ### modeltype = cat_vocab_dict['modeltype'] - ### set some flags for choosing the right model buy here ################### + + ############################################################################ + ### A Regular body does not have separate NLP outputs. #################### + ### However an Irregular body like fast models have separate NLP outputs. ## + ############################################################################ regular_body = True if isinstance(meta_outputs, list): - regular_body = False + if nlp_flag: + if len(nlp_outputs) > 0: + ### This is a true nlp and we need to use nlp inputs ## + regular_body = False + else: + regular_body = True + else: + regular_body = False ############################################################################ ### check the defaults for the following! @@ -584,7 +603,7 @@ def train_custom_model(nlp_inputs, meta_inputs, meta_outputs, nlp_outputs, full_ try: y_test = np.concatenate(list(heldout_ds.map(lambda x,y: y).as_numpy_iterator())) print(' Single-Label: Heldout data shape: %s' %(y_test.shape,)) - max_batch_size = y_test.shape[0] + max_batch_size = int(min(y_test.shape[0], 4096)) except: max_batch_size = 48 pass @@ -644,7 +663,7 @@ def train_custom_model(nlp_inputs, meta_inputs, meta_outputs, nlp_outputs, full_ tune_mode = val_mode if tuner.lower() == "storm": ######## S T O R M T U N E R D E F I N E D H E R E ########### - randomization_factor = 0.25 + randomization_factor = 0.5 tuner = MyTuner(project_dir=trials_saved_path, build_fn=build_model_storm, objective_direction=tune_mode, @@ -657,14 +676,14 @@ def train_custom_model(nlp_inputs, meta_inputs, meta_outputs, nlp_outputs, full_ #### This is where you find best model parameters for keras using SToRM ##### ############################################################################# start_time1 = time.time() - print(' STORM Tuner max_trials = %d, randomization factor = %0.1f' %( + print(' STORM Tuner max_trials = %d, randomization factor = %0.2f' %( max_trials, randomization_factor)) tuner_epochs = 100 ### keep this low so you can run fast tuner_steps = STEPS_PER_EPOCH ## keep this also very low - batch_limit = min(max_batch_size, int(2 * find_batch_size(data_size))) - batch_nums = int(min(5, 0.1 * batch_limit)) + batch_limit = min(max_batch_size, int(5 * find_batch_size(data_size))) + batch_nums = int(min(8, math.log(batch_limit, 3))) print('Max. 
batch size = %d, number of batch sizes to try: %d' %(batch_limit, batch_nums)) - + #### You have to make sure that inputs are unique, otherwise error #### tuner.search(train_ds, valid_ds, tuner_epochs, tuner_steps, inputs, meta_outputs, cols_len, output_activation, @@ -825,7 +844,7 @@ def objective(trial): print('Model training with best hyperparameters for %d epochs' %NUMBER_OF_EPOCHS) for each_callback in callbacks_list: print(' Callback added: %s' %str(each_callback).split(".")[-1]) - + pdb.set_trace() ############################ M O D E L T R A I N I N G ################## np.random.seed(42) tf.random.set_seed(42) diff --git a/deep_autoviml/modeling/train_model.py b/deep_autoviml/modeling/train_model.py index ad23afa..64218d4 100644 --- a/deep_autoviml/modeling/train_model.py +++ b/deep_autoviml/modeling/train_model.py @@ -120,6 +120,10 @@ def train_model(deep_model, full_ds, target, keras_model_type, keras_options, patience = check_keras_options(keras_options, "patience", 10) optimizer = keras_options['optimizer'] class_weights = check_keras_options(keras_options, "class_weight", {}) + if not isinstance(model_options["label_encode_flag"], str): + if not model_options["label_encode_flag"]: + print(' removing class weights since label_encode_flag is set to False which means classes can be anything.') + class_weights = {} print(' class_weights: %s' %class_weights) cols_len = len([item for sublist in list(var_df.values()) for item in sublist]) print(' original datasize = %s, initial batchsize = %s' %(data_size, batch_size)) diff --git a/deep_autoviml/preprocessing/__pycache__/preprocessing.cpython-38.pyc b/deep_autoviml/preprocessing/__pycache__/preprocessing.cpython-38.pyc index fb3fc8f2039e2049f7ef20a722091e6b1964ce5f..55f9c3d6413d69c439cf77531e508c1781c0badd 100644 GIT binary patch delta 5045 zcmbVPU2Gf25xzT~Na{q1lthX8wN5`)EX$H?S+;E1mTmo6wq?b(f{?i3?9A-^d^b0H{Pp=yQN^|Laz%j8^rhd;{c`bfMU(jPLC=u0`}{b zMir_O1>j5dY9oxoMh&XrYXx+*sFw3(z(-Jo^8w)NP@Pea>iN1{Z!j8BBj+nX*Mypk zX4K5rL4A+Wf?7Bq(p94swKCqpm9S_-ZCoMh?WjZZB==@Jan+_M3R(y7FXVYhtXkh^ zbfGS88`j&6{b)buYxHj806JjwpdP-ig>^6LHTqB=Uq@hl5FO-voqovZNBzcObl4a` z&HxwgbX*TXj-Vr4-k={f2GO8#3?1X^M$jEc$2s2w{17_9`DWlxqLZB8qn|Q{(J<#* z^1(Q-g3W1kn)9t-a|WF;&N}FnQ8a3dp)q3|jdO!GFql9S#yND(m_(CYChF(W z1wf%)zi3RMDdQ5lWK5%JuItcWG%lmd#uap>V0#r^)jF~>n*y5EX0zAuUXJH%@}=x` z4QTQPUppPin@N;>IeQDB=3MeNQ$U!`LJD`#9V3dO#$9yRh@lwQy#Tc^T;|XmU+)83 zmMW_WN-*LmZeWBB4Qa+an&&p#H=zsfe<+{@+=shs@qvdTTEqu&5AFpnfyEi2>(KKm z&s(y1-TjeoUs({E)&rOI1OM|s`p!s0xPKv-IkF&Rj@qU8F!j_5YmhPZWR*>U9$E5j z$mE&rrL{B?0Syi>$r~Pm!FIrXS!s2SFl*4)Z0Qux9p%yhlV#Cb zB0}c0UFEzc&Wrds9(w4fr^mo#P5h?tngFsB+p^3V>cwqKC6M?mI6g_wpBW$t?gshShlhCM0OS|p==N!Zxzvd5zzCJ3eX%a1o(&!WG2iiJD6;s z2k9Yvd0nCdaLXs4T|Z>csIw{HD}37%2-#>iC7YAM-YvZYQ#6! 
zy@fPd*M#lBJKW^Gv~5ithQr@k?NDc+;2VK&AGr^{_tFmhGI(yqx19Bog6lTgiEpnL z>uRU_Xcz6mch)`mfV32=OAD{5Y*sWDww&1FL)iMSdwv1+Ewp5ppmj96;I zv{PzK9n-Y5nlkkVs*%E)-rL)|deH3;cPij0t!uW1d)=qC17l6oQAE7id@RSr-3 z|G_iBf{eQFMj~!yq|QAMIpoR>7s@1(WECv|aSzvCx|xUZE8oT4mBF1{0LSq8!)feE^QB|Fdh&`JuWK(`0~vXRk%#W%Iw}shzo_qaPt_~2!I{NaT2oW=s%FNa z&9GXyHHp}qu32hw*|OC+4LU?=`Zew%{SdQo3FbU_w;)2vuMk%v!}&-2UzR3?%R7_l{^8rv!%!MOBJFYzF|?3 z7#|QTBw6%JwPKa1z$)a;TX-c!1i2XSD54}tpL(SL=<~k;$uI8k0&-Y`EC4nk?vZJ| zC4R}>Y^m=6Nx>)c#I8>i8KQ�A!#@AyILE)AFkG6Ua911ycNkf$_mL3YYb^D6ls` zFI+yj{BV_035)eYMzIBWD^%FKC-4&Eu(TwRA*x^x=*!GnDrY@#Qp+Hn!X+S)u{Tq; z`VZ{O1Z+83g8fU5M7|EJH|wJ^F5Q$e<+L2ueyp4na9O4Twg9zDlM1a^5bUyK00(e6 zu2?V01nqJhq=Bab4$+FI!do&w5{ik)8zNEcifoW7@Fo(Xp)uk1aHbMGgp!q*{RU_y zu5#8rnJVhdR#E<6kp^iM)Kg7;AgN}5VrVIFm5lq^&&;7++skkj-(B3Z?Pd4wGPe7= zVAhEO`$F2)v@HKNdD9V0>4pIC`2ao-9MvM6_2oFsYBAdia)1@&8`Q^v^_z-aoefhT z)a9pM>Zhgi9yoVuR;LveKC_DnI6tiXvBAbU?{)r6@9KBLRUETp zFjOt>iM#|oK;n!*ALK`2uCjQLu;(ReYxnJ(uv$8^lDMboJ^crBl4&EqJs*w7;)_}? zI6EhVR}PFq8LefIp~@?5WjwR?RFsnmk5fuz3&GQ5}2R{9o9 zZN=)bie7QT=sPrlLisTL2c!AhE#rP#k)M@bygeOnKvI~=HrB<$dEd-lt`!9X$nUogK|kDF)Kfg)m>n+ zlljc{L1m^m-j+>ZGdrmq09Az}eS1@DR}J|VOV1O1#!|2w~>>p!Xc@{!Of2$f=Z#-C_np9D4t3`;K@KN?%c0IgE0vzWnp+1 zkRP+;`P<0$E6E!md)GOwprqEI*axqG?&PhK{j$s6wRj!yBud$*&`BlA%!pue!t983W;@66HplY9=~7sRB;4HI3axUA1j}Qod5s; delta 2764 zcmcguOH5nY8TPq0HV;LI$#tRMelB*R&~>6+>9*5Lh95O+KOAL z2YZBF51bc!1#STB!#;tXfc@A{12`b;Mm0!7I3%zOxG)aW2#yH5No}L;xLshk+CgP3 z3)~DiilYT#AT@?N6>FxebdIzfA!JcH0e6=%G}fxVMtg9Ni1ny38pm;gz3O?|i+gDw z?i02T?0(!&6F4DkKiC)W1%U(VMS2Neq62t<4&p)KYgG%?#6c*M#7RMf)FC>Ihv^6& z5q223Q9LSe1n^}%CU6_zaXc3I7Jumg79^!i*yMu(QEh`y^gO7UzfT}Z{QpBCcY_bq~5}} zl+M!aBMV+pR!VosIicf9X0>!zfi-ilqU-(4>zN0oH$Y(%6&_@;Nr1@RmC_IKL%N37 z=p+1yKE{s)_ZrM#>hT0W5q1y6nk|~MKTgH?D8D!DWY#%-nD>lr0O%qn4QFSY9QCAkZY8UmFoh&e46pl_O-u4<$$OvK_4x^ zj+xy-wTBkv&L23%LbSPEzLprlu&Y#O|iKxt1%Cr>C@Su%s{2gK-R+`23j{GW=Cgs>ovBg)vm3TmWqE1J@_$mrrWJkz#mlnz-b1X0Ato^^ zvls1aR%5_O8iT+K9pE`@>XEg<=_TP1{N<^SkhpnQPv;i=2*jlH088rDqIBrU$13yB z?br`%w2QzD3#O(;3$dh;%C_iDnIOB!E|GbTT&;za2>cb43F+ZXB-4~>BM~^`g#+mb zu_T<|LX|7Ek;Nkf{)MtX6C{gdiOe5JuXsXV_LSO5$E!QawJSSPlZPn#W~JwA(28BN z=pC$|Tqn!q1|$lTn+Ntv8*>b)B+^Kuwl?<>{`KD;M z!FON0cf-bF)&th-ES2fpM3vvQ6nGzi_W>KOdLifO{zKs2F!#?gGeP333B<)svEePa zN5&$sm@QTliI;YfrOcN${_yzc?w?utW6ux!W@Y%~^|YpMXSd{yw61LBwC#EO3dBgl62S3QKcDQO88kkTa2p+puk2(R3}x{}xi3_Mz&DV>c~QYj5+X;m(Gy zgHLadd7NmXrGJ{B{e*ax> zgILLCM)5CV?=*e{_2NiF=3|q8+OvSZ!xQo4<3hX>p&uOo^!zyDpZER=eZ>FJ*Bw$J W%&h4Yl`9A;IBmS6f1I!NhyM%aN&c_^ diff --git a/deep_autoviml/preprocessing/__pycache__/preprocessing_images.cpython-38.pyc b/deep_autoviml/preprocessing/__pycache__/preprocessing_images.cpython-38.pyc index d93de86070820c8f17a951cea117d9e829ef17b1..730f3d404609cfd6e41c6a006dec49e2bbfd8e1e 100644 GIT binary patch delta 281 zcmX>vzfPVnl$V!_0SKn)JW6~jI+0I?F=nIo6vld~9E)7bD9c={D63rSC~HQB6owS( z9GhI*DBE1SC_6BpF~>g2zDgujG|ORuBZ$m$TFA)A5YCXN!pH!EKr!bm7pN?l1yK>n zkjE4OGEpW+HP>E!Q{7H`g!9FV{cHKQ|yM0IWtnME-%F`GEOy zK*hdMzPWx;ez|HYrOw=v!0WDJ>{%CnR)YBM)) z5u<2HYF=@EQCd!Zd3;G~MaeA|5IMP;?>b}D(<4rV4s5k@vfIVKq( JPlb_(2>^v5P4WN$ diff --git a/deep_autoviml/preprocessing/__pycache__/preprocessing_nlp.cpython-38.pyc b/deep_autoviml/preprocessing/__pycache__/preprocessing_nlp.cpython-38.pyc index 6004ebf124686b9f3b9f2cf9ff45f40a6ebc87b1..02c31873ab6781039a4e78a56982b01d8540d8b8 100644 GIT binary patch delta 2097 zcmZ`(U2GIp6rM9PyE{Ak*B{!lg_csV?NAEXs{9u$h=2&iLV+o;PJ4%Tw=>)2&Xfq5 zO^sF5#3kVfl% zyGQS_H|d*7HdveW&1QY!fhj@XVs0rsNES>12`Ti3#MdeGdqo=v-(u+Fg2=y;t8`3U0>We!bt`r|+}d 
z^>%x|zTZBeAFy>@w+HkARuF3@4&pkn4(W$V^00omBvX2dw2;;(1^rRdMl_TLSwohP z_9sCfBuhyLGDBn;S&ovBwPXdk8$BlJBwffHA@`7bQ4W*)$V!wWWF1*WR%1k(K%bzk zzsB!xryooY7dxdw5PI0v@UD6#EXz0E&LGlb zY-x0$vWj%w!TyMBl1)c@U?;l~9fyxtCbklOXJ=y_xb2x(A9!qWT&vuZ-RjAO@RZ<1 zM}wzi`a1e*m=m|4mi-W~gJs3P;&l@E#fmB!AkAv3KM7{=7HFD%Up<5Tg*gkL&EkyB zq3q+@dIU}K1I~z%Gt162yIHdwleE*`>UuewJ@#s^b5!yWPGl=TrKODuDCG4hv(0AF!(41oDd|8A` zY-Qu7#OmAbP#ow7#b+D00$gBUEog^r?C%9$)rbF{pMHdzCen}Db4_ytm(keH-e|gi z%j2l|m4=hbj@!B8skCcS!yTjaDdg#14tqFkK+uBp6be6b%&^kLaLp8P=*Qf9hOJoG z7%8craQjnsV4*e-J63Qi&=w$yLxBn$^KsBC{2Xk0;V!typm~0xh%tWXh&5)oU7hr8 zwzT=d>WBGig|8M>P@x{^3_ID}7n?ywlju7r=q2`3b898Gs^A29Bz~g`=vCIZ=!M1$ zX!~&d4WI?oN;N>cxz>v?z`j_tY$C_aqX?Qz9j?0^#yLF3S;?1dBS-5{_X7qoa;~P7 zuArRqJm=M8V?)MZ%1NIv={Y3ykZV5X@|$^!TX9}c$IKzTQj8$wZG4R$+)FJkuPm2D zYUhsL)XoNGxu27gFq{R#93x!(&Hf?P}DN6{-0p7AoSswlH} ztir7MsHe{6s7wkIEBOvX1;taRfWHZ7sh;dbNmactDduD4gt+r6Oj+&4W?!?2Cz3f{ zY*Na{y|@<}l`x{`c5FAs)_C#RPscG!%nR|&9_`1Y1W{Ly$oRU}8b!HfEr{1dwyyQX za4**MiA7CKO>BMJJa)6yp5Qe@PjYygL#a(l4YLYOU(Dv{dt3>+V^-R6X+IC*f1Z}0 zFL9l5IL+Y}45XtBm*_L?{air57SByM0vVCMnW~VX7 zMs}eN8RO;>Yd76(bpYnW~)l(9zE1d_p;*;23rY&BcP9+@U|khQQ@P=?rY_9)n4wt}q$JHl46$H0y< z(j{qYven8(A88Y}0z2d>Ma0!ok`yb{6!}G5R2Pt|;&iZ_)QP*m4~u`*Gh~&k{0d5? zB6C@M8{AcRjy0!awmK4I6{GTLncLum3H335%m9gd30 zaE#mo)&vXrA>2hyhzCAN=knSv8ZR@;h`H$_+k z=JfK$?EOf<ENrxheWRUHt6q{EF|@#sF(<|K4fbFG$rRv4e8^? ztaC{GQMwe8RF~xd|0(;fj^^v8nMj)B2_xA*#tcS_9tc*S!PszeFh#TX%F9WZs`kLp zL|Y^Ip&S)9npv%uhmyzY->*!*+M9%&h%_rnS)>~OX@>_@`r97}G zNvS6MO15_XJegb;Yh&&Cs~))WB7|qNr(zw1oELW%HjobCi91U_`hP?GW89U@KLL~T zo2i7&_2HzuSN;SBeFm^qT#2tI&xqgS<;_zl?m_5ASO=i__)B1N!K1o4%yipGu#}sS z3$daq7R22AQ-se&qDrfT!jn=3FqTLNnx7Ovn-7g&7iX*b8{ffD9YFK)4PbIAH`qkK zk+Ae*{9P3C2Fx+tZf)TgMN9RT;!aHH2Zto9L?so3-w-cWcZF|&qAC0m7=BsYt6p5# zjgdZ#glUTizbR^KPQ}i{I7g1ZNBA2UiU&Qr`JKS1_^GCOeiF4Y0L{aXAm<1vgrlOe zHo64A!0hYifaTPqWBqzxA~k%%;4>i5Jli;Cei9$aJz_#{}OEX%}gfBXEZ;#c^Op!X~2oqHl+1JK){rrC0l(K#C)Y@hKjT9hg~R>q4pKnKnx;EoSN|5G~sl}2{}Go z@sYi-UN7*#fCqkPrP-#sHAEyAi$Eq{l1|jb<8fiuSBgdT)+C-YpGJ5N!M*M7y^R&^Su~^C!j_6%gE0loI+TQ@Cw4K2(JOq0Q?<& z=bI5a5IPauJzqd}1tE)Y9RX{O-$cNR%1?^rO?AW(y-nK`;+MsRrbXcnKU`&1_51x^ Gf6;$R@APQ^ diff --git a/deep_autoviml/preprocessing/__pycache__/preprocessing_tabular.cpython-38.pyc b/deep_autoviml/preprocessing/__pycache__/preprocessing_tabular.cpython-38.pyc index cecbfccab4d05ff2ec85b4cc34abfaefc2eaac7a..d5812c96b987f25f1079e3221eaed556cae59260 100644 GIT binary patch delta 9823 zcmb_C33OZ4mH&N7U*Foi%S&u5Cvg&Gah!eMNJ0W3A&DJGOdLn{lWp0uWIcUnah@Uv z8xjIeOkOBZXmC0N>J~~sGo7V9Wy-V+ZD)FD%P_4<50qiL%xQblcFLL30(0;Cl9z;( zbk0cU-~ZlycYE*Md*34c?VIYQ->IHGZnr~$&!fd}wme85@>G!jdSTAz)hUN)9bGU; z8V}HgbkU&F*i0AGCGb8-4~r{m*_<_WEz~GILf6v`Ko_J(>CJQ_l-lSfx*6KqX&>D} zw+<4q*fM8TJ#7G7gdU@N=$$|nrFYT2fa#(8Xd}F1l+*pR30fFcC<(_K<1|4J&}QKO zv}IaT2Mqz$QF@$q(k`Itqe&X3-B3CP^tIuA-HnZnM4=HH1v*aqX%CG-YhsH+8I2Dr zG#>RXWS#J2xrA zq^U7cJxuzR^9o*FrqD%QWLRVGh@HhXx!w$JDd6nPD+Y_#CQTGy+hqgPaig4X=ha-{ zy9TU@ZCnG~WkaVc04K%qU3^<-E-flk+N@q>fDk3J(3oQK(3N!6&_u4)V_aDti^;=h zjkx*ti@?prkBUp@T*(T}m~jPEuHn@t@m)%!lqmQG>{@Y~*YA9n@7|;|eJN4h_bGAQ zJIf|Di&5_k@^$f=cTV!_V7Sh{eqAZ+va=gb1)su`>j}AI)zk-y`{@+r$N(3alW*goECY7#mP!9L_F-3*;|uRMDBVNF50i2WR+`&C1N(S#3=L4Jf zg3VjI6rl1g1#@nduH%K$ZJ<&%^Sy0Z_dsI~W|)d#9y6hORt8d5clat(-!v6qJEW zFH;gN!>-tE()=PA@>WUtN`m%1ZA|T)2K3V>($hO&j(3Vz%cten@HS2%c!FSi5IVd| z{Hc6ej+<3LKX-E~R#sHOl)w}e)^j>(id`{7^h_z+OH)3 zl3niDeZb?*IGl%F5yKT_b7t@W?=DjSMgT&8>75YxQN9AgIx53j8}_h&0qJ*%*DI#Y z!_rMS;}FXgW3jxKck@V_I!T`=)+we}9!~BnQxXg`$E0R;IG6R)qC`AWo52-zA?^Z? 
z(fl&SOi0k0XB3aj4P(Yv9=p!=sl%Yd5b1xyoQ zdg%eA9Z-{rLx5@q)S)aYaX8W=MOcUP*j6YV1YuADx_}hcGC^4DL}5}Ey-L<}DFI}~ z4|3bctcwZ_LJZ}xFl|fp(qwz$5M7iwZ1`y#@0Aw%X*)jzr9*&Q#1BL1FkP8ALOT*i zX(-XhdlScCY!Gza!6P_>h{rL@V{h^v9J55iDDIrikMX0lb4cY07$F|n zPrGP%P~knvO*mx7X}4LGwIqm14iHlqh{|V0dL#cR=+3s2-H65s#`1nAP;CWa8dQs_zALgBjI>Sk9;hSNkMNquQ9ukMMpz&~Cvs zpZ2s{22_x7U%Qex*;mW&gK(d|Urn4!+|N&e33Rw1mfRWf@RK8MV;;YMfM7%ca_6L# zFve{YBL{kSGBdCR?28a@4C4({ltrdy+RPrl!cWSvEsHqh=$(WvlmUI*P(~COS$nDC zRmzc>m+=x_-e!S)hY`4}DvHAgAMqRCzSNfLzp*EGV6`Uh3r1R_G)PzaYN)R@U) z`?JzPsB5BBF+8JaK`pZ2XKSdv!?!8a9rtYr2bodr+Zc@;f&yA=M15`HXuw#vu5W|* z_1tOVReu%PE=zaMo^5I9+bOos@}-`c?NH~>MPa}b>Fo}(P^+&s8t(0m_`<<9!xxDA zLUCU_6lo2b0{hm0kdnEyH;6j61sPCCO}-u$>|xQ?U>vC1eL=>eEN!JM+LKy5ZxJES zijLY3Mf?1j$yNdzy6guAT(QGb4^MnHfObf=O;ZzulTO7Tohm#Qc&zZ)+N@m~I4^lf zjalmzwiL)mHTD3!(rTn9ZH2*Kp0)v8u#}xcES8rp9#zw7ZC#oiksLE6WXrJKu{s=z z8?Djqp0#~oNn8V@rl?A}(&rZeyH`}7nZDBoNNL_wC`j|lC!5!%UTchF4yjs}8}s5_p`PaMpb;=- zFJ!L(d9j$kvY7m{*u0XHuclsJ*-nyH)k8{21u0d%B#(HAMpQ-p$f{~k!XeT>a#&29 zmci*C>8gX^R|AebfL@yeICY{zO(*&oeU=I(As5n0dW%W#A=bHBCSc5{u_Bu;#aq#s zan{sVR_`KC59lgT9bkhlJZl?BS*mp{C3Bv_DR>NkUtkw)oAu`DJfG@gH$!@*2u^0ahu}ECs}4aaeT1W~biY{7Y+j zCBz1+LV(UKom9$oSk7(hneM}esR*WzkxZs@TEzKn^H;ECT3fLoYuz~RCm z2`jf!Le(LXuyJ)nJ*l)2YN6Ia9TKm`wGoxtpr**0ZN$Pgxq-5AT!HP}4%t`R!RDc{ zMC__xYR?RW_+VGLzi-72MuVvCTa*0cL7EL6U@c(3-^!fuVr~R-%;a$CtPY1lPOluQ z3((%^3P+<|y*h&-M?+mfO#& zfkmkoA|I$?B>9b8$g+{B;cE#PtsOyHE1urjtL6|-ofXab%Zq1Kr(`?JoN? z7$OIm)#6BJdVjUEu$hy4KLA|moZu0I1)8JLLu>&yo6bTYQ?i9vUxXkt1W>{e5M<$e zvPe9zyGC6`#EZKlYM?IL;Wxw=cD9K*_4}{eyQCPY58puV`m??BTdtkQkLd|pik435 z!_|hTYcl>867MdkB(36PLzTLcq(0a1Q@i$WIEHVC^ZPfD=foTP8-#DYl{_HkH`S2W z#h#{~)vHnd8UTVrlz6+fRV< zZE?PR3HeFtTkQolvR3>!yz;tJG*{Ghci+Gi&9j05GTtm^XU9X$ezmj7grs!@;vKMq z$U5286U2T1h8BWU$fJ2s^t!U4rLrH0xsgiJA#RJ*Y^65CGGaAs+ya|M+eKCGNbQE! 
zY46lKb4)z9!dc!MGZi@}z4nMp1szAFo}I z2F~T!&8bjOima*%t77K+N6oFPrUB#@SLg=uD;zj zFl~d`X=9f``*_|MotQT;VM^y6y=E$}im}NV!f7VS?}6ly_<3I?`QEvEPNmKqi(cRM zl<=O2@B8HR{?}Q%ee1KN@SY zdk8y_I{x4WVqq^S;?<#>?CfP=$+e099y(UgJl5Zr0O?oRL2=?tsfHAJHu2<{_2kFm z?K35H{|X41T+%ich`m7V`xeu&>T_g*N^JiP3TmN&m!np?-a8u0g5>>-n3JZ}7GovE+kL84Q zh@->XN`8kV9Y74JeJ}FtLr@1vvQYeb*qNMye7I#-KvMx>A!xp(RU!}xpJI40?{0)A z{wOrVQBi}(4v!8`4p-!5awOs47Vdxx11BX37u@g>*wX(1_ALrkai?Ms?u5+XHdO9} z`)%0S3|W#1kBO$XE-SkXH$*5P7p@F*fi%~!7*?towh=8urDYOOo?$mJ+y$v8kJ=&i z))kHBxi9o92`#)G?GK)rpQsbeD#(;` zpj>D;cmXe@E>K(q7rxSPP-#b%AnB@JQ#V{r61LlL(oBcB12438;_;d0CTxZa2Zg&} zVDh*PNIPhLM5P6^a7asdCilSXSFFfzk9fGo?IpM+%;7nZW{X76xr#+AaE?#He6gnx z+=kU^1iP`i2}|%O>=J@U0rYJbm(LX^X{b9GiHD*QII%|HI9uzh?}+w>>DY-JPPi>W zUm$Z@JrrP}Kub8du7NE_nM)90s(=IK$K`fu7AAOqE?a=Ax+Vi6O&y(+Mz-M#T(bFjN5NGY74nS%(}CeFh(q0tWK zz~*o;!py}172JVfCjvJD4}x478h4&A)TUu*bcpErvITHfBI<{l#RY4HPRd~o$*vfA z5>pB*heL*|3y)94BVIT^wKfj&|iLt_Fab0dO;p8CNGMThv#j3 z3g%k_Cv)%|?tnOO7^-2RcD&CuV4;PD*q(_4CoaS$j_VK!PO5`Lx;TWN@42T5op-$k*<+-L-I`q7&rb3eO3i3#mbHf z*JGU(uOW-v;Wk4FRsoI(*+C{<{nBFvD@KvcTyw18;O5nU^&^*s9Rn4uQXsJ52Y1?I zK73hRd2DL(9yqxYZUqNJr?OEe?EWCzQnoLL1^)Pq(ou)%h@EzpIW-sA= z-0PN(<1)uJDpovRvv{(XG#0kN^o*yb=6I+tI3X3uEcCv(@OX{)1)L1DRDp@`!gRQZ z)lx2g^!Rkrf9{FX?mje0Gmk;9iASDT@03nDVM>4hL?IlcTo)FQ!E;X>6pw#qcC}=( zWtgy^(k0{u0{w!WkiPJT&lHgVl#KU^wHHElvR64b5)OVG5wyR;0f(Ycq^H-2zb8Jp zP%`&EXql7+OF-Y)Z_*c_HJb&S-jO-*vbgifd3!#a%^un0@qf>O;4m^9#@-gMJz1Q* z;VcO03f#_2SJXw_@Oy_%{=kt(JripAkQwv9#dBn?gx{S;oIDRau>eXg-1_hWaO*;# z%15fCBx{gl`9M}QMpgu5#gl}c$udbE_)P>v7xLnEbaZdTW!Br_+JCH8Y}U9Jf*|)} zb81pye>SQ9zg{h~e?aHLUGrG_6Cb$jGhPPFeAs19ye-b&7zD!Mi~|YZnVMA)4S(^~ zG5aT}7H^5`P*vSTRRxOQJr=ymHK7g5LS+&ipUnZ_FS!x#zxw2f!EoKhpafgtj|xg- zp4nAn>?*o3N8T!jX2A7(V*$3B*Us6i>PDw*m&gS&+@^@r9t?)#U(S}tCz~)F9YvfY9eRF`lkgM7Oy`& z&vp)2P6+Lpup5nn%RYf}!y@^NpA3r8XEv`#3Wlemw7c27)@%*Jv^9^pJM0vY1XpP` z45fY?E-Ta^uUpu#sQui-8^(4zew%scA@?G3z{YpEn(2t+JCN~x?EAtxI)j`Nb4LBs zp9CF#f{PEn@U{|OIH}{WTf~vko4hB|S`Yjj5{R=X+SZNW%Zhk*bRqd>>aEcxLJp_a ze4&{nU&WR(kig!;(rhffjU~LDWy`R-62a?OmEQ6bEaCkU!`l~z7Zmb-fPEKBcO&ru z1VIFNIF-jvhWll<9f5)1D1u{T5x{;d4kEZ00WK-*Bm#6mb{0x$+wFm)aNUJN%$^jt zJ~xBBB6^-XzHPalr%!b(09dEba?Eqg(ib^u^aW7EuckX34qexCl#d*1brs+S!0Gxl UJzp@qLN!#DoIu9)=PkJD}@GJ zn283jqHUuFMFnA6vCy{11@%)}TgB0~?dckOX0$h*=~idj|Xgqp9@8aiz^Y1~3< z>Ga)7qlaEWXTW_c?Gaz9uBu+TfG&h~O1IHPv<^5QT}+pNMho3euck{u(n^=nYj!K* z3PaVr9kiaV0a<{qrRzW|NY~Q_xOdY&+DJD-ONc6zbh1W9chXICGf2X83-v(%`wgXQ zIc=luAlpGB)K5Er>jVAj&JA6Sjg3U1owN(A?WFxQK!ecAmnpQHhIT777M+PSs`(YDE`yoIhf2gA!01lp#y#OWinZ;?{#CQmC7G;Q*oO9mb&)b~7)$ zO4zry+6F6KT(sE0q$X)Du zA+Jwll5`D*q-z_vE(X`dN!M1kju(ULXD@J_!cGjOgdXhMt=s%|n#b#DJ}nqDMApEF z)w`6)dc80i&U*1e?o`uEh+&cVGPfj(+{MwmCv+937r@IN za*Ma~U8@}IJKio+@bfmAg8#e}+J+n!Gi!UZXR;7?GVRWQBznt1-TJl1)~LI9q;0Sk?~Ce!WqgIy9&2O z*TZsR;5j5ct3#=5GR=&H1Jy}cW(Ufy;~|>vQW7%-t!aAJ@FX5ONJb>OfioV?M1bjB zrwvdJQ`uqNMK?f;P8;)~Wk8MefMg>`dgvz99Z;i@UJz{tQSXQ-vMtapO&Gz%SHNwN z#=ZhQk2JRRJY(MTjY(TgV{A>37K}q_VLLYsR~%GmGpyY-mP=`*hemyoUOGLpO|PVs z_s9qe4vXiH=}ZH?@pZIM3S6WIx4)14fzG`5AVfY#~=4=hc}am>=# zZM+-DEE3VP{8fA>-$C04RUUy6;(-D=e%%L2^is{>t{+Uz723yqeynD6j4HUk30#bzA)Uzk2Jx(1|NS~Du6l&bo8eG1>s`}s|+2Hf0fXRBdA1tYtmV(sax z=DT5GPuQeJZi(E=Z-Eg0VJJQHLcq!Q3_J8{e(M0i1%a{)cF!(uI)9--a(j~Fn!)n) zl^P9Tkp>s8!1!b%W|+Zh`5rmAu7Fhz-yX=p1?wxQE5iy5ttD4URxB6K;dw0vI7@WL zc9rC6WUctGN_UCd8=&r{_CVM@-(5!My1gyBkGb0ex~~=PZQigrJ~4}|6wj8G)acCH zUG1)K3-)wUx8Bwsb_asGyUFMFx?4JfUftd6W$oUkPT#_X?y_)Sok%Y)6{jmTAu4mo z4d8B)C23epu5yb-6DL4xW$Z7NlT|WTlus$iZwWGYi_fd~FrT}d`MO!K*%uDC2U`0U zqA}A_bR#~l%(29clm)xj#O|FkosgsAt?Dymsd#hhM6q<*r(|kuYAq!qvZ-8@ubUcs 
ze!78>+hQlL*kmF{V+C`!kgA`+0je>*%cpxi?Sbwd-P7!ajHflGg*)22JyI?uwe0W3 z^Ybdn^WyjOI5`k&zOt2UR8=R*B?Tl`%_eEYOjJcZYgDZSe`fqCs-m6M)FRoMZL$(4 z(TEC98vMrU8==2h>ZcLAn75$Vm9V8rYuK8w2CcE41*-_LiuV?-frs)%lvF+fiLf64 zsW3AEx40VUjvG6Jt+jDWpeN8C>hZC+Q0Jt0rf#b117MG;aCnp_X|qc5AfEAreGvO= zU5l|WPPWI%jw6Ka#V#|E%tGP?^1irv=^m0H>Q>~Cp9}Z0TJte%GKz*}G zI~Hu&sO4ZsR*PghkP16HiP~?8j{~`vw3C6&GsL)HdEO}g8ts9eE{QY`K#qN{i2C3L z(}$q>yf_}5K|YNAA(&wz)5LfB+{;dZMXc!Qx@ZbCBe+8p!4$|mBy8jqSjJ9)Tm@z* z+iigKb4VF80dIq`&v63KoRV!HS*t3;mS*rb`j?`JjlDLqRkZe&Eg3fGMsC1840b$= zZt|P`7C#(TQO1q%j$xsuK{cXrjhjKktfvgX;SV`(hKmyBEk2HJ*~F$F#jGZVmS6Set>buqIZFEOEvzuih3nhC@t;vevb;c>I`k zb2r*UAjZv|!C*&EH!BiHcUGI;1^GDf&pY$eNrnpb73#Lx2zQmeE2a(HCSrTbW6$wr zrpr#yW-(`|SiDo4B_ciZ$4*q+mXV2KPhtS0%l*jt%N^%UBJs#YlQ(A7Txw;X0>ev$ zV{aij6`MBj?d2mK6m_?U$3{5oj*$pIlWTc2LUuJqwG_!1sQeThhNZ)w;t<7Rd%G_0 z^hFq+&_zyP8g=TrU|25i^mWjCL^^$4lK`%>F* zbIlF#@NY)(;{HYCeQ{=g&W2YoT&x9g6Y~b(JTx14*k?b2TcfSl+u2Ux)Zn3Ewhk?R zf?|sv_H?#~^+pYgHga-AVd+Yp^{BW(1n#ZK{|&~R27MG_ho1(=gETl)6cDyaJbiET zq-<1O4b2rMwi#}5EA!EwW}m0ayPaK&jr%~#mW%oKEgSbIG=^V>6nPHGBg`Z2y>EHW zr>NoqWeDJU?6U$%nfS|nMbWRZj~V-b^9;haLetV_DL^6oje{!%t|f4V7eh0gK+SMj z;L_kq;fnn5lpC>f1GmC^gpHDj9X>da4l}!jn}-#u;(=Eu+y-U90dI$><O+%^N;t*C0Czv60*ttDZo=)Vc3(tw{6(q7^{1Ui|adERO(&4yEq#U>= z`&$sk9X(;6$Jf>5qZH2T?5rYM4=gLiBHUP_*i*ZWuP|u(2gKJsWO% ziD$0ChG(M|fR>#D=B2(xOntDpM7Ko zv}hhhn;~Mw=myQOsw8l+1i7n_j6uj3q9kRU`22t-77W`+wYYA$ zN`?23ZNm#lmw0ygw}gow9-Ol9R~XDj?2lnXRlfoS-6)WIS`fKeAd6d?JA+|gSPuY3 z!YW7>^i%2RK-l90f8VzZiVDY>tjy5=9%anRq)hp0yj84(6MsG3bJS zI4+~0dsh7W!{fE>amm;ey~YJIW9h&-YE2ts7Xk=zR8 zvl`^8#S4!VtZN@m1zyhjC~zul3BfrSNPHlk5{ENcMC9QZH;zP|kp;t%7{XRjeW)z& zLIA|Ay}@R0lPBEX=M&yT*=gqhA+eB{{QD1OMPS}?4@uxHDXV1#YUDPBD^f%J(# z9NOy$LnyF6%Q#*W{f{oPNdzY6@6AWc$%Ep{N2if~(B3L0KI#y6K2}&Fdzq5GFmMTg zeaImS*iP}zV`bz?*)uB29&fLaNu~U6fQ~PrOy;mIJ@EqS!S{qj`Ft5O!YZq%2qq1V zq;@D`Y!^Za+D8AdeqmB_AO25ZM(Wq*xU61>#r(rlR^0#puU;eI3H_ruc{nSI+vz3i z7eFo*Hda|JvF^b~%Mq|)J%Z=adSv@Q)+0cfgcXz>M-qVvz=@Khg%%un)=9+4$MeLQ zqgM0VIE0IY^Vr1F3>d_b6TRKwG$wWhyDEqDCLB_OxM|%uF>q|Qu9=aM|LiCm&iY z<9`JWypQBnAaO%?u$#Szk1rv?c-Za8Nw8KS_cur+_O2ECo^s8RC>wWPB)>z2Pm%lw zl24F)gXCKzUjm7%oniK>_{~!#xvRiK+yq~M^e`NOfj3egd{85z;P_->64xHzOl}vC z9~We=nD_J))8B&1Jkj-Zrvs0c@PT3wZj6fGKV3oKe`v9*&8Wfd5*uTMg?Au33!Ux- zZY~@d<+&W-YhZiDJ+XW0#-j#S4miBZ4FrcZF|jgXnVUJWg3fIV=hl8=+4r9qM;42_ zPE<_z0h%Im>w+78O@SLu;JMoz@h>NevUkPJ-SA1%8)iZD*oDMF#JFc_$&0b2&uk{7 zJ9g+<4~f2jEp`m^1LTU4!*4lEuGZMwBgs}Z4}j4P?pROPI7k!IF3ZM>GQWx$gI Ot{QmF(5y4Ewf_cSM8|#r diff --git a/deep_autoviml/preprocessing/__pycache__/preprocessing_text.cpython-38.pyc b/deep_autoviml/preprocessing/__pycache__/preprocessing_text.cpython-38.pyc index 7b82048c99e524586bb37a9c20d5f512e9707eb7..07413b2f2812692ef267f620415c2223064cfaa6 100644 GIT binary patch delta 280 zcmeBGe5=44%FD~e00hlCj}j#(^2##CY}B5@STB`hk!u-cnQIkgm1`Yk&B&0#kRqL9 zlWQAgn`;+k2j(;8*hkq{iKL2VIV^Alky%a)85tSE8S+#Z89)#y=A7jMl?AgPDk2&3 zm?A(X%H*i#x<axgP7iZHS<$}!0Rc`A%NOaM|~M!5h0 delta 292 zcmaE>(5uKB%FD~e00ce`wTb2vd1V<>Hfm2{tY^%zh_Xy!NRi61%C(NN&b5iM$+eBL zWn=(~N$1$*+DF;vIz%~u`HVS^QI1t2siIj<3!Fh@mdip$Muu>PJQYR;5Cn?3X1PIS z!7PZ1NQOM72#|>~IjXttQSP}OQ69OTQJ%S8R#9HL-cjCQ<7IOsbM>P1a($wFzmKl+<>Tn+`y>7+@PqS$puWJjKQ1RnC5aZhD=uBUCJ1>`4n#v zqhLvDUU7a=T26j>d`W6WNs-B9bN*Y5QImxPTqGGem_!%_7&(}k7)2P_80DB`fIJmO G9wq=T!As2m diff --git a/deep_autoviml/preprocessing/preprocessing.py b/deep_autoviml/preprocessing/preprocessing.py index 23e944f..0e9779f 100644 --- a/deep_autoviml/preprocessing/preprocessing.py +++ b/deep_autoviml/preprocessing/preprocessing.py @@ -24,7 +24,7 @@ # Make numpy values easier to read. 
np.set_printoptions(precision=3, suppress=True) from collections import defaultdict - +import os ############################################################################################ # data pipelines and feature engg here from deep_autoviml.preprocessing.preprocessing_tabular import preprocessing_tabular @@ -65,6 +65,7 @@ from tensorflow.keras import regularizers from tensorflow.keras.layers import Dense, LSTM, GRU, Input, concatenate, Embedding from tensorflow.keras.layers import Reshape, Activation, Flatten +import tensorflow_hub as hub from sklearn.metrics import roc_auc_score, mean_squared_error, mean_absolute_error from IPython.core.display import Image, display @@ -183,24 +184,38 @@ def perform_preprocessing(train_ds, var_df, cat_vocab_dict, keras_model_type, nlp_names = [] embedding = [] ################## All other Features are Proprocessed Here ################ - fast_models = ['fast','deep_and_wide','deep_wide','wide_deep', 'mixed_nlp', + ### make sure you include mixed_nlp and combined_nlp in this list since you want it separated + fast_models = ['fast','deep_and_wide','deep_wide','wide_deep', "mixed_nlp","combined_nlp", 'wide_and_deep','deep wide', 'wide deep', 'fast1', 'deep_and_cross', 'deep_cross', 'deep cross', 'fast2',"text"] ############################################################################## meta_outputs = [] print('Preprocessing non-NLP layers for %s Keras model...' %keras_model_type) - + if not keras_model_type.lower() in fast_models: - ################################################################################ - ############ T H I S I S F O R "A U T O" M O D E L S O N L Y ######### - ################################################################################ + ############################################################################################ + ############ I N "A U T O" M O D E L S we use Lat and Lon with NLP right here ######### + ############################################################################################ if len(lats+lons) > 0: - print(' starting categorical, float and integer layer preprocessing...') + print(' Now combine all numeric and non-numeric vars into a Deep only model...') meta_outputs, meta_inputs, meta_names = preprocessing_tabular(train_ds, var_df, cat_feat_cross_flag, model_options, cat_vocab_dict, keras_model_type, verbose) - print(' All Non-NLP feature preprocessing for %s completed.' %keras_model_type) + print(' All Non-NLP feature preprocessing completed.') ### this is the order in which columns have been trained ### + if len(nlps) > 0: + print('Starting NLP string column layer preprocessing...') + nlp_inputs = create_nlp_inputs(nlps) + max_tokens_zip, seq_tokens_zip, embed_tokens_zip, vocab_train_small = aggregate_nlp_dictionaries(nlps, cat_vocab_dict, model_options) + nlp_encoded = encode_nlp_inputs(nlp_inputs, cat_vocab_dict) + ### we call nlp_outputs as embedding in this section of the program #### + print('NLP Preprocessing completed.') + #merged = [meta_outputs, nlp_encoded] + merged = layers.concatenate([nlp_encoded, meta_outputs]) + print(' combined categorical+numeric with nlp outputs successfully for %s model...' %keras_model_type) + nlp_inputs = list(nlp_inputs.values()) + else: + merged = meta_outputs final_training_order = nlp_names + meta_names ### find their dtypes - remember to use element_spec[0] for train data sets! 
ds_types = dict([(col_name, train_ds.element_spec[0][col_name].dtype) for col_name in final_training_order ]) @@ -209,48 +224,61 @@ def perform_preprocessing(train_ds, var_df, cat_vocab_dict, keras_model_type, print('Inferred column names, layers and types (double-check for duplicates and correctness!): \n%s' %col_type_tuples) print(' %s model loaded and compiled successfully...' %keras_model_type) else: - ####### Now combine all vars into a complete auto deep and wide model ############## + ############################################################################################ + #### In "auto" vs. "mixed_nlp", the NLP processing is different. The numeric processing is the same. + #### Here both NLP and non-NLP vars are combined with embeddings to form a deep and wide model # + ############################################################################################ + print(' Now combine all numeric+cat+NLP vars into a Deep and Wide model') ## Since we are processing NLPs separately we need to remove them from inputs ### if len(NON_NLP_VARS) == 0: - print(' Non-NLP vars is zero in this dataset. No tabular preprocesing needed...') + print(' There are zero non-NLP variables in this dataset. No non-NLP preprocessing needed...') meta_inputs = [] else: - #### Here both NLP and NON-NLP varas are combined with embedding to form a deep wide model # FEATURE_NAMES = left_subtract(FEATURE_NAMES, nlps) dropout_rate = 0.1 hidden_units = [dense_layer2, dense_layer3] inputs = create_fast_inputs(FEATURE_NAMES, NUMERIC_FEATURE_NAMES, FLOATS) #all_inputs = dict(zip(meta_names,meta_inputs)) + #### In auto models we want the "wide" part to be short. Hence use_embedding is set to True. wide = encode_auto_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, vocab_dict, - hidden_units, use_embedding=False) + hidden_units, use_embedding=True) wide = layers.BatchNormalization()(wide) deep = encode_all_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, vocab_dict, use_embedding=True) + deep = layers.BatchNormalization()(deep) meta_inputs = list(inputs.values()) ### convert input layers to a list #### If there are NLP vars in dataset, you must combine the nlp_outputs ## + print(' All Non-NLP feature preprocessing completed.') if len(nlps) > 0: print('Starting NLP string column layer preprocessing...') nlp_inputs = create_nlp_inputs(nlps) max_tokens_zip, seq_tokens_zip, embed_tokens_zip, vocab_train_small = aggregate_nlp_dictionaries(nlps, cat_vocab_dict, model_options) nlp_encoded = encode_nlp_inputs(nlp_inputs, cat_vocab_dict) ### we call nlp_outputs as embedding in this section of the program #### - print(' NLP Preprocessing completed.') + print('NLP preprocessing completed.') merged = [wide, deep, nlp_encoded] - print(' %s combined wide, deep and nlp outputs successfully...' %keras_model_type) + print(' Combined wide, deep and NLP outputs successfully.') nlp_inputs = list(nlp_inputs.values()) else: merged = [wide, deep] print(' %s combined wide and deep successfully...' 
%keras_model_type) - return nlp_inputs, meta_inputs, merged, embedding - elif keras_model_type.lower() == 'mixed_nlp': + ### if NLP_outputs is NOT a list, it means there is some NLP variable in the data set + if not isinstance(merged, list): + print('Shape of output from all preprocessing layers before model training = %s' %(merged.shape,)) + return nlp_inputs, meta_inputs, merged, embedding + elif keras_model_type.lower() in ['mixed_nlp', 'combined_nlp']: ### this is similar to auto models but uses TFHub models for NLP preprocessing ##### if len(NON_NLP_VARS) == 0: print(' Non-NLP vars is zero in this dataset. No tabular preprocesing needed...') meta_inputs = [] else: + ############################################################################################ + #### In "auto" vs. "mixed_nlp", the NLP processings are different. Numeric process is same. + ############################################################################################ + print(' Now combine all numeric and non-numeric vars into a Deep and Wide model...') #### Here both NLP and NON-NLP varas are combined with embedding to form a deep wide model # FEATURE_NAMES = left_subtract(FEATURE_NAMES, nlps) - dropout_rate = 0.1 + dropout_rate = 0.5 hidden_units = [dense_layer2, dense_layer3] inputs = create_fast_inputs(FEATURE_NAMES, NUMERIC_FEATURE_NAMES, FLOATS) #all_inputs = dict(zip(meta_names,meta_inputs)) @@ -259,20 +287,27 @@ def perform_preprocessing(train_ds, var_df, cat_vocab_dict, keras_model_type, wide = layers.BatchNormalization()(wide) deep = encode_all_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, vocab_dict, use_embedding=True) + deep = layers.BatchNormalization()(deep) meta_inputs = list(inputs.values()) ### convert input layers to a list + print(' All Non-NLP feature preprocessing completed.') #### If there are NLP vars in dataset, you use TFHub models in this case ## if len(nlps) > 0: print('Starting NLP string column layer preprocessing...') - nlp_inputs, embedding, nlp_names = preprocessing_nlp(train_ds, model_options, + nlp_inputs, embedding, nlp_names = mixed_preprocessing_nlp(train_ds, model_options, var_df, cat_vocab_dict, keras_model_type, verbose) ### we call nlp_outputs as embedding in this section of the program #### print(' NLP Preprocessing completed.') - print('There are no NLP variables in this dataset for preprocessing...') else: + print('There are no NLP variables in this dataset for preprocessing...') embedding = [] - meta_outputs = layers.concatenate([wide, deep]) - print(' %s model: combined wide, deep and NLP (with TFHub) successfully...' %keras_model_type) + if isinstance(embedding, list): + ### This means embedding is an empty list with nothing in it ### + meta_outputs = layers.concatenate([wide, deep]) + print(' Combined wide, deep layers successfully.') + else: + meta_outputs = layers.concatenate([wide, deep, embedding]) + print(' Combined wide, deep and NLP (with TFHub) successfully.') else: meta_inputs = [] ##### You need to send in the ouput from embedding layer to this sequence of layers #### @@ -348,13 +383,13 @@ def perform_preprocessing(train_ds, var_df, cat_vocab_dict, keras_model_type, print('There is no numeric or cat or int variables in this data set.') if isinstance(nlp_outputs, list): ### if NLP_outputs is a list, it means there is no NLP variable in the data set - print(' There is no NLP variable in this data set. Returning') + print('There is no NLP variable in this data set. 
Returning') consolidated_outputs = meta_outputs else: - print(' %s vector dimensions from NLP variable' %(nlp_outputs.shape,)) + print('Shape of encoded NLP variables just before training: %s' %(nlp_outputs.shape,)) consolidated_outputs = nlp_outputs else: - print(' Shape of output from numeric+integer+cat variables before model training = %s' %(meta_outputs.shape,)) + print('Shape of non-NLP encoded variables just before model training = %s' %(meta_outputs.shape,)) if isinstance(nlp_outputs, list): ### if NLP_outputs is a list, it means there is no NLP variable in the data set print(' There is no NLP variable in this data set. Continuing...') @@ -362,8 +397,72 @@ def perform_preprocessing(train_ds, var_df, cat_vocab_dict, keras_model_type, consolidated_outputs = meta_outputs else: ### if NLP_outputs is NOT a list, it means there is some NLP variable in the data set - print(' %s vector dimensions from NLP variable' %(nlp_outputs.shape,)) + print(' Shape of encoded NLP variables just before training: %s' %(nlp_outputs.shape,)) consolidated_outputs = layers.concatenate([nlp_outputs, meta_outputs]) print('Shape of output from all preprocessing layers before model training = %s' %(consolidated_outputs.shape,)) return nlp_inputs, meta_inputs, consolidated_outputs, nlp_outputs ########################################################################################## +def mixed_preprocessing_nlp(train_ds, model_options, + var_df, cat_vocab_dict, + keras_model_type, verbose=0): + """ + This is only for mixed NLP preprocessing of tabular and nlp datasets + """ + nlp_inputs = [] + all_nlp_encoded = [] + all_nlp_embeddings = [] + nlp_col_names = [] + nlp_columns = var_df['nlp_vars'] + nlp_columns = list(set(nlp_columns)) + + if len(nlp_columns) == 1: + nlp_column = nlp_columns[0] + elif keras_model_type.lower() == 'combined_nlp': + nlp_column = 'combined_nlp_text' ### this is when there are multiple nlp columns ## + else: + ### This is to keep nlp columns separate ### + nlp_column = '' + + #### Now perform NLP preproprocessing for each nlp_column ###### + ######### This is where we load Swivel model and process each nlp column ### + try: + bert_model_name = "Swivel-20" + if os.name == 'nt': + tfhub_path = os.path.join(keras_model_type, 'tf_cache') + os.environ['TFHUB_CACHE_DIR'] = tfhub_path + tfhub_handle_encoder = 'https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1' + else: + tfhub_handle_encoder = 'https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1' + hub_layer = hub.KerasLayer(tfhub_handle_encoder, + input_shape=[], + dtype=tf.string, + trainable=False, name="Swivel20_encoder") + print(f' {bert_model_name} selected from: {tfhub_handle_encoder}') + ### this is for mixed nlp models. You use Swivel to embed NLP columns fast #### + if len(nlp_columns) > 1: + copy_nlp_columns = copy.deepcopy(nlp_columns) + for each_nlp in copy_nlp_columns: + nlp_input = tf.keras.Input(shape=(), dtype=tf.string, name=each_nlp) + nlp_inputs.append(nlp_input) + x = hub_layer(nlp_input) + all_nlp_encoded.append(x) + nlp_col_names.append(each_nlp) + else: + nlp_input = tf.keras.Input(shape=(), dtype=tf.string, name=nlp_column) + x = hub_layer(nlp_input) + ### Now we combine all inputs and outputs in one place here ########### + nlp_inputs.append(nlp_input) + all_nlp_encoded.append(x) + nlp_col_names.append(nlp_column) + except: + print(' Error: Skipping %s for keras layer preprocessing...' %nlp_column) + ### we gather all outputs above into a single list here called all_features! 
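[Editor's aside: a minimal, self-contained sketch of the Swivel-based text embedding pattern that mixed_preprocessing_nlp uses above. The TF-Hub handle and the trainable=False / dtype=tf.string settings come from the patch itself; the column names "title" and "body" are hypothetical, and the sketch assumes the tensorflow_hub package is installed and tfhub.dev is reachable.]

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers

# Pretrained 20-dimensional Swivel embeddings trained on Google News (same handle as above).
swivel_handle = 'https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1'
hub_layer = hub.KerasLayer(swivel_handle, input_shape=[], dtype=tf.string,
                           trainable=False, name="swivel20_demo")

# One scalar string Input per NLP column; each batch of strings maps to a (None, 20) tensor.
title_input = tf.keras.Input(shape=(), dtype=tf.string, name="title")
body_input = tf.keras.Input(shape=(), dtype=tf.string, name="body")
title_embedding = hub_layer(title_input)
body_embedding = hub_layer(body_input)

# Multiple embedded columns are concatenated into one (None, 40) block, which can then be
# merged with the tabular (wide/deep) outputs exactly as done in perform_preprocessing.
combined_nlp = layers.concatenate([title_embedding, body_embedding])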
+ if len(all_nlp_encoded) == 0: + print('There are no NLP string variables in this dataset to preprocess!') + elif len(all_nlp_encoded) == 1: + all_nlp_embeddings = all_nlp_encoded[0] + else: + all_nlp_embeddings = layers.concatenate(all_nlp_encoded) + + return nlp_inputs, all_nlp_embeddings, nlp_col_names +################################################################################# diff --git a/deep_autoviml/preprocessing/preprocessing_nlp.py b/deep_autoviml/preprocessing/preprocessing_nlp.py index 17a345b..fc18fc2 100644 --- a/deep_autoviml/preprocessing/preprocessing_nlp.py +++ b/deep_autoviml/preprocessing/preprocessing_nlp.py @@ -123,7 +123,8 @@ def preprocessing_nlp(train_ds, model_options, var_df, cat_vocab_dict, keras_mod 'wide_and_deep','deep wide', 'wide deep', 'fast1', 'deep_and_cross', 'deep_cross', 'deep cross', 'fast2'] - max_tokens_zip, seq_tokens_zip, embed_tokens_zip, vocab_train_small = aggregate_nlp_dictionaries(nlp_columns, cat_vocab_dict, model_options) + max_tokens_zip, seq_tokens_zip, embed_tokens_zip, vocab_train_small = aggregate_nlp_dictionaries( + nlp_columns, cat_vocab_dict, model_options, verbose) if len(nlp_columns) == 1: nlp_column = nlp_columns[0] @@ -360,7 +361,7 @@ def encode_NLP_column(train_ds, nlp_column, nlp_input, vocab_size, sequence_leng #print(f" {nlp_column} vocab size = {vocab_size}, sequence_length={sequence_length}") return nlp_vectorized ################################################################################################ -def aggregate_nlp_dictionaries(nlp_columns, cat_vocab_dict, model_options): +def aggregate_nlp_dictionaries(nlp_columns, cat_vocab_dict, model_options, verbose=0): """ This function aggregates all the dictionaries you need for nlp processing. Just send in a list of nlp variables and a small data sample and it will compute all @@ -380,20 +381,24 @@ def aggregate_nlp_dictionaries(nlp_columns, cat_vocab_dict, model_options): if len(nlps_copy) > 0: vocab_train_small = [] for each_name in nlps_copy: - print('Creating aggregate_nlp_dictionaries for nlp column = %s' %each_name) + if verbose >= 2: + print('Creating aggregate_nlp_dictionaries for nlp column = %s' %each_name) max_tokens_zip[each_name] = cat_vocab_dict[each_name]['size_of_vocab'] print(' size of vocabulary = %s' %max_tokens_zip[each_name]) seq_tokens_zip[each_name] = cat_vocab_dict[each_name]['seq_length'] seq_lengths.append(seq_tokens_zip[each_name]) - print(' sequence length = %s' %seq_tokens_zip[each_name]) + if verbose >= 2: + print(' sequence length = %s' %seq_tokens_zip[each_name]) vocab_size = cat_vocab_dict[each_name]['size_of_vocab'] vocab_train_small += cat_vocab_dict[each_name]['vocab'] vocab_train_small = np.unique(vocab_train_small).tolist() - best_embedding_size = closest(lst, vocab_size//4000) - print(' recommended embedding_size = %s' %best_embedding_size) + best_embedding_size = closest(lst, vocab_size//50000) + if verbose >= 2: + print(' recommended embedding_size = %s' %best_embedding_size) input_embedding_size = check_model_options(model_options, "embedding_size", best_embedding_size) if input_embedding_size != best_embedding_size: - print(' input embedding size given as %d. Overriding recommended embedding_size...' %input_embedding_size) + if verbose >= 2: + print(' input embedding size given as %d. Overriding recommended embedding_size...' 
%input_embedding_size) best_embedding_size = input_embedding_size embed_tokens_zip[each_name] = best_embedding_size return max_tokens_zip, seq_tokens_zip, embed_tokens_zip, vocab_train_small diff --git a/deep_autoviml/preprocessing/preprocessing_tabular.py b/deep_autoviml/preprocessing/preprocessing_tabular.py index 46c82af..7c260b0 100644 --- a/deep_autoviml/preprocessing/preprocessing_tabular.py +++ b/deep_autoviml/preprocessing/preprocessing_tabular.py @@ -327,7 +327,7 @@ def preprocessing_tabular(train_ds, var_df, cat_feature_cross_flag, model_option except: print(' Error: Skipping %s since Keras Bolean preprocessing is erroring' %each_bool) - ###### This is where we handle Boolean Integer variables - we just combine them ################## + ###### This is where we handle Boolean + Integer variables - we just combine them ################## int_bools_copy = copy.deepcopy(int_bools) if len(int_bools_copy) > 0: for each_int in int_bools_copy: @@ -361,16 +361,24 @@ def preprocessing_tabular(train_ds, var_df, cat_feature_cross_flag, model_option else: nums_bin = max(20, int(max_tokens_zip[each_int]/40)) int_input = keras.Input(shape=(1,), name=each_int, dtype="int32") - encoded = encode_any_integer_to_hash_categorical(int_input, each_int, - train_ds, nums_bin) + if (max_tokens_zip[each_int] >= high_cats_alert): + encoded = encode_any_integer_to_hash_categorical(int_input, each_int, + train_ds, nums_bin) + if verbose: + print(' %s encoded: %d categories, %d bins. After integer HASH encoding shape = %s' %(each_int, + max_tokens_zip[each_int], nums_bin, encoded.shape[1])) + else: + encoded = encode_categorical_and_integer_features(int_input, each_int, + train_ds, is_string=False) + if verbose: + print(' %s encoded: %d categories. After integer encoding shape: %s' %(each_int, + max_tokens_zip[each_int], encoded.shape[1])) all_int_inputs.append(int_input) all_int_encoded.append(encoded) all_input_names.append(each_int) if verbose: - print(' %s number of categories = %d and bins = %d: after integer hash encoding shape: %s' %(each_int, - max_tokens_zip[each_int], nums_bin, encoded.shape[1])) - if (encoded.shape[1] >= high_cats_alert) or (max_tokens_zip[each_int] >= high_cats_alert): - print(' Alert! excessive feature trap. Should this not be a float variable?? %s' %each_int) + if (encoded.shape[1] >= high_cats_alert): + print(' High Dims Alert! Convert %s to float??' %each_int) except: print(' Error: Skipping %s since Keras Integer preprocessing erroring' %each_int) @@ -384,16 +392,19 @@ def preprocessing_tabular(train_ds, var_df, cat_feature_cross_flag, model_option int_input = keras.Input(shape=(1,), name=each_int, dtype="int32") cat_input_dict[each_int] = int_input vocab = max_tokens_zip[each_int] - encoded = encode_integer_to_categorical_feature(int_input, each_int, - train_ds, vocab) + #encoded = encode_integer_to_categorical_feature(int_input, each_int, + # train_ds, vocab) + encoded = encode_categorical_and_integer_features(int_input, each_int, + train_ds, is_string=False) all_int_cat_inputs.append(int_input) all_int_cat_encoded.append(encoded) all_input_names.append(each_int) if verbose: - print(' %s number of categories = %d: after integer categorical encoding shape: %s' %( - each_int, len(vocab), encoded.shape[1])) + print(' %s encoded: %d categories. After integer encoding shape: %s' %(each_int, + len(vocab), encoded.shape[1])) if encoded.shape[1] > high_cats_alert: - print(' Alert! excessive feature dimension created. 
Check if necessary to have this many.') + if verbose: + print(' High Dims Alert! Convert %s to float??' %each_int) except: print(' Error: Skipping %s since Keras Integer Categorical preprocessing erroring' %each_int) @@ -408,8 +419,10 @@ def preprocessing_tabular(train_ds, var_df, cat_feature_cross_flag, model_option cat_input_dict[each_cat] = cat_input vocab = max_tokens_zip[each_cat] max_tokens = len(vocab) - cat_encoded = encode_string_categorical_feature_categorical(cat_input, each_cat, - train_ds, vocab) + cat_encoded = encode_categorical_and_integer_features(cat_input, each_cat, + train_ds, is_string=True) + #cat_encoded = encode_string_categorical_feature_categorical(cat_input, each_cat, + # train_ds, vocab) all_cat_inputs.append(cat_input) all_cat_encoded.append(cat_encoded) cat_encoded_dict[each_cat] = cat_encoded @@ -418,7 +431,8 @@ def preprocessing_tabular(train_ds, var_df, cat_feature_cross_flag, model_option print(' %s number of categories = %d: after string to categorical encoding shape: %s' %( each_cat, max_tokens, cat_encoded.shape[1])) if cat_encoded.shape[1] > high_cats_alert: - print(' Alert! excessive feature dimension created. Check if necessary to have this many.') + if verbose: + print(' High Dims Alert! Convert %s to float??' %each_int) except: print(' Error: Skipping %s since Keras Categorical preprocessing erroring' %each_cat) @@ -487,9 +501,9 @@ def preprocessing_tabular(train_ds, var_df, cat_feature_cross_flag, model_option all_num_encoded.append(encoded) num_only_encoded.append(encoded) all_input_names.append(each_num) - print(' %s numeric column left as is for feature preprocessing' %each_num) + print(' %s numeric column left as is since float' %each_num) except: - print(' Error: Skipping %s since Keras Float preprocessing erroring' %each_num) + print(' Error: Skipping %s due to Keras float preprocessing error' %each_num) # Latitude and Longitude Numerical features are Binned first and then Category Encoded ####### @@ -617,9 +631,16 @@ def preprocessing_tabular(train_ds, var_df, cat_feature_cross_flag, model_option meta_input_categ1 = all_low_cat_encoded[0] meta_categ1 = layers.Dense(concat_layer_neurons, kernel_initializer=concat_kernel_initializer)(meta_input_categ1) else: - meta_input_categ1 = layers.concatenate(all_low_cat_encoded) - #WIDE - This Dense layer connects to input layer - Categorical Data - meta_categ1 = layers.Dense(concat_layer_neurons, kernel_initializer=concat_kernel_initializer)(meta_input_categ1) + int_list = [x for x in all_low_cat_encoded if x.dtype in [np.int8, np.int16, np.int32, np.int64]] + float_list = [ x for x in all_low_cat_encoded if x.dtype in [np.float32, np.float64]] + if len(float_list) == len(all_low_cat_encoded): + ### All of them are floats ### + all_high_cat_encoded += float_list + else: + meta_input_categ1 = layers.concatenate(int_list) + all_high_cat_encoded += float_list + #WIDE - This Dense layer connects to input layer - Categorical Data + meta_categ1 = layers.Dense(concat_layer_neurons, kernel_initializer=concat_kernel_initializer)(meta_input_categ1) skip_meta_categ2 = False if len(all_high_cat_encoded) == 0: @@ -779,6 +800,22 @@ def encode_binning_numeric_feature_categorical(feature, name, dataset, bins_lat, return encoded_feature ########################################################################################### +def encode_categorical_and_integer_features(feature, name, dataset, is_string): + lookup_class = StringLookup if is_string else IntegerLookup + # Create a lookup layer which will turn strings 
into integer indices + lookup = lookup_class(output_mode="binary") + + # Prepare a Dataset that only yields our feature + feature_ds = dataset.map(lambda x, y: x[name]) + feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1)) + + # Learn the set of possible string values and assign them a fixed integer index + lookup.adapt(feature_ds) + + # Turn the string input into integer indices + encoded_feature = lookup(feature) + return encoded_feature +############################################################################## def encode_string_categorical_feature_categorical(feature_input, name, dataset, vocab): """ Inputs: @@ -796,7 +833,7 @@ def encode_string_categorical_feature_categorical(feature_input, name, dataset, Outputs: ----------- encoded_feature: a keras.Tensor. You can use this tensor in keras models for training. - The Tensor has a shape of (None, 1) - None indicates that it has not been + The Tensor has a shape of (None, 1) - None indicates that it is not batched. When the output_mode = "binary" or "count", the output is in float otherwise it is integer. """ extra_oov = 3 @@ -1076,7 +1113,8 @@ def encode_any_feature_to_embed_categorical(feature_input, name, dataset, vocabu # Learn the set of possible string values and assign them a fixed integer index #lookup.adapt(feature_ds) encoded_feature = lookup(feature_input) - embedding_dims = int(math.sqrt(len(vocabulary))) + #embedding_dims = int(math.sqrt(len(vocabulary))) + embedding_dims = int(max(2, math.log(len(vocabulary), 2))) # Create an embedding layer with the specified dimensions. embedding = tf.keras.layers.Embedding( input_dim=len(vocabulary)+extra_oov, output_dim=embedding_dims @@ -1281,18 +1319,32 @@ def encode_auto_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, CATEGORICAL_FE numeric_encoded = [] text_encoded = [] encoded_features = [] - + #### In "auto" model, "wide" part is short. Hence we use "count" with "embedding" flag. for feature_name in inputs: vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name] extra_oov = 3 if feature_name in CATEGORICAL_FEATURE_NAMES: cat_encoded.append('') cat_len = len(vocabulary) - encoded_feature = inputs[feature_name] - encoded_feature = tf.keras.layers.experimental.preprocessing.StringLookup( - vocabulary=vocabulary, mask_token=None, oov_token = '~UNK~')(encoded_feature) - cat_encoded[-1] = tf.keras.layers.experimental.preprocessing.CategoryEncoding( - num_tokens = cat_len + 1)(encoded_feature) + lookup = StringLookup(vocabulary=vocabulary, + mask_token=None, + oov_token = '~UNK~') + if len(vocabulary) > 32: + # Convert the string input values into integer indices. + encoded_feature = inputs[feature_name] + encoded_feature = lookup(encoded_feature) + embedding_dims = int(max(2, math.log(len(vocabulary), 2))) + # Create an embedding layer with the specified dimensions. + embedding = Embedding( + input_dim=len(vocabulary)+extra_oov, output_dim=embedding_dims + ) + # Convert the index values to embedding representations. + encoded_feature = embedding(encoded_feature) + cat_encoded[-1] = Flatten()(encoded_feature) + else: + encoded_feature = inputs[feature_name] + encoded_feature = lookup(encoded_feature) + cat_encoded[-1] = CategoryEncoding(num_tokens = cat_len + 1)(encoded_feature) elif feature_name in FLOATS: ### you just ignore the floats in cross models #### numeric_encoded.append('') @@ -1303,7 +1355,7 @@ def encode_auto_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, CATEGORICAL_FE else: cat_encoded.append('') if len(vocabulary) > 100: - print(' ALERT! 
Excessive feature dimension of %s. Should %s be a float variable?' %( + print(' ALERT! Excessive dimensions in %s. Should integer %s be a float variable?' %( len(vocabulary), feature_name)) use_embedding = True lookup = IntegerLookup( @@ -1333,7 +1385,7 @@ def encode_fast_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, CATEGORICAL_FE # Create a lookup to convert string values to an integer indices. # Since we are not using a mask token but expecting some out of vocabulary # (oov) tokens, we set mask_token to None and num_oov_indices to extra_oov. - if len(vocabulary) > 50: + if len(vocabulary) > 32: use_embedding = True lookup = StringLookup( vocabulary=vocabulary, @@ -1346,7 +1398,8 @@ def encode_fast_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, CATEGORICAL_FE # Convert the string input values into integer indices. encoded_feature = inputs[feature_name] encoded_feature = lookup(encoded_feature) - embedding_dims = int(math.sqrt(len(vocabulary))) + #embedding_dims = int(math.sqrt(len(vocabulary))) + embedding_dims = int(max(2, math.log(len(vocabulary), 2))) # Create an embedding layer with the specified dimensions. embedding = layers.Embedding( input_dim=len(vocabulary)+extra_oov, output_dim=embedding_dims @@ -1365,7 +1418,7 @@ def encode_fast_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, CATEGORICAL_FE encoded_feature = normalizer(inputs[feature_name]) else: if len(vocabulary) > 100: - print(' ALERT! Excessive feature dimension of %s. Should %s be a float variable?' %( + print(' ALERT! Excessive feature dimension in %s. Should %s be a float variable?' %( len(vocabulary), feature_name)) use_embedding = True lookup = IntegerLookup( @@ -1374,7 +1427,7 @@ def encode_fast_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, CATEGORICAL_FE num_oov_indices=extra_oov, max_tokens=None, oov_token=-9999, - output_mode="count" if not use_embedding else "binary", + output_mode="count" if use_embedding else "binary", ) # Use the numerical features as-is. encoded_feature = inputs[feature_name] @@ -1407,8 +1460,9 @@ def encode_nlp_inputs(inputs, CATEGORICAL_FEATURES_WITH_VOCABULARY): for feature_name in inputs: vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]['vocab'] extra_oov = 50 - vocab_size = int(math.sqrt(len(vocabulary))) - best_embedding_size = closest(list_embedding_sizes, vocab_size//4000) + #vocab_size = int(math.sqrt(len(vocabulary))) + #best_embedding_size = closest(list_embedding_sizes, vocab_size//4000) + best_embedding_size = int(max(2, math.log(len(vocabulary), 2))) lookup = StringLookup( vocabulary=vocabulary, @@ -1483,7 +1537,7 @@ def encode_num_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, CATEGORICAL_FEA #################################################################################################### def encode_all_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, CATEGORICAL_FEATURES_WITH_VOCABULARY, use_embedding=False): - + #### This is a new version intended to reduce dimensions ################# encoded_features = [] for feature_name in inputs: vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name] @@ -1492,7 +1546,7 @@ def encode_all_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, CATEGORICAL_FEA # Create a lookup to convert string values to an integer indices. # Since we are not using a mask token but expecting some out of vocabulary # (oov) tokens, we set mask_token to None and num_oov_indices to extra_oov. 
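[Editor's aside: a small sketch of the lookup-plus-embedding pattern used by these encoders, with a toy vocabulary and a hypothetical "color" column. The log2 heuristic shown here mirrors the one introduced in this patch and keeps embedding widths small: a 1,000-term vocabulary would get int(max(2, log2(1000))) = 9 dimensions. It assumes a TF version where StringLookup and Embedding are available under tf.keras.layers.]

import math
import tensorflow as tf
from tensorflow.keras import layers

vocabulary = ["red", "green", "blue", "violet"]   # toy vocabulary (assumed)
extra_oov = 3                                     # reserve a few out-of-vocabulary buckets

# Map raw strings to integer indices; unseen values fall into the extra_oov buckets.
lookup = layers.StringLookup(vocabulary=vocabulary, mask_token=None,
                             num_oov_indices=extra_oov, oov_token="~UNK~")

# Embedding width follows the log2 heuristic, never dropping below 2 dimensions.
embedding_dims = int(max(2, math.log(len(vocabulary), 2)))
embedding = layers.Embedding(input_dim=len(vocabulary) + extra_oov,
                             output_dim=embedding_dims)

color_input = tf.keras.Input(shape=(1,), dtype=tf.string, name="color")  # hypothetical column
encoded = layers.Flatten()(embedding(lookup(color_input)))               # shape (None, embedding_dims)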
- if len(vocabulary) > 50: + if len(vocabulary) > 32: use_embedding = True lookup = StringLookup( vocabulary=vocabulary, @@ -1505,7 +1559,7 @@ def encode_all_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, CATEGORICAL_FEA # Convert the string input values into integer indices. encoded_feature = inputs[feature_name] encoded_feature = lookup(encoded_feature) - embedding_dims = int(math.sqrt(len(vocabulary))) + embedding_dims = int(max(2, math.log(len(vocabulary), 2))) # Create an embedding layer with the specified dimensions. embedding = layers.Embedding( input_dim=len(vocabulary)+extra_oov, output_dim=embedding_dims @@ -1525,8 +1579,24 @@ def encode_all_inputs(inputs, CATEGORICAL_FEATURE_NAMES, FLOATS, CATEGORICAL_FEA encoded_feature = normalizer(inputs[feature_name]) #encoded_feature = inputs[feature_name] encoded_features.append(encoded_feature) + ################### + int_list = [x for x in encoded_features if x.dtype in [np.int8, np.int16, np.int32, np.int64]] + float_list = [ x for x in encoded_features if x.dtype in [np.float32, np.float64]] + if len(int_list) > 0: + all_int_features = layers.concatenate(int_list) + meta_int1 = layers.Dense(32)(all_int_features) + if len(float_list) > 0: + all_float_features = layers.concatenate(float_list) + meta_float1 = layers.Dense(32)(all_float_features) + #### You can add a Dense layer if needed here ########### + if len(int_list) > 0: + if len(float_list) > 0: + all_features = layers.concatenate([meta_int1, meta_float1]) + else: + all_features = layers.concatenate([meta_int1]) + else: + all_features = layers.concatenate([meta_float1]) ##### This is where are float encoded features are combined ### - all_features = layers.concatenate(encoded_features) return all_features ################################################################################ from itertools import combinations diff --git a/deep_autoviml/utilities/__pycache__/utilities.cpython-38.pyc b/deep_autoviml/utilities/__pycache__/utilities.cpython-38.pyc index 14a96970f53e7718c90d51737c69d56aff26aecf..076be1844fe9bd04d8bfa0f350c4fb220f6ee3ab 100644 GIT binary patch delta 3480 zcmYjTdvui55#RZC^JViS1PqV70K;PmF9ifcT1o-|!U-f8Fb`#2c9&$$x4ZG~0tp1e zLrejAWe`raDngIcqUGoZ1`VXzis4j=*q8X#L#u$|6RaYrN9q0D732Q#+qpAyXXf5- z?#w)NNIP&yOIehXl4z5^Q3Kyy`p9!BMjlz_s=R+{bkr;#d5q{Vr?`gGVRND@T}wMI z6~&(bcL84EMfnfV8>W^2CFy?w=T-ju*yDN+$TxZSxbdwgC0Wzg7g*sJb%9WwyU}C# zqfx(j6e_0x2LXSO;7SrlK>y6XTw002u+ML}L!O{tJPUa@Am##c%^R$V^9J3%WnwlY z3g&=%&1U17$1)P^)B?Xro(SY0VEcF|f z!y7gtqJux1^fG;6{&8{&QKxzAwnWPMpVW0Z#V6ROEPM{?65ukwKV=ww%yCm)^sza9 zYP?p~3I3N7T=Ak2^jCnd0r7wQdPdkRj_ z*Su?5Ucal7{W;le5r-sfj&u@NxqM`Q{?D|>bK%)#k2(E8(&C#K<2q;?@ox*W`dyRs zmEQenB(9mS7glSeo5QCY4tkN>i}R_IyNYM)|H8gA`;%sBc^`oM5O54|3qT!CfLaUC zczj7No#F*01GAL(5V&E0I&GY!TxsG6Zz%Z%y}+-OjF8F~OMakq^TXMFv{Xbt-<2Ys z!;a_KQ96VYd3fnI$}|s`?r@OBvlbW`BV_CrHMle@!3c!>D+9i0y%@pW3$|xuNs)@o zazq0%b9Tef2{y`qNq^;oKH|&eW9dBSRLmJP3U(7jc%@sWL0DL1 zi2I>)om(r$&_#ZxVk=!V3l{!Lqg-BHIgCc|X3>N6lb$DLJKTwEpe zq8rQUbeg}b8YfRQy1MWt^>|)@`4T87vvYWLbrH?vuIiC}W=gKSs<^p9Pn}=%@K@E3 zQ4Y7))aJc}aCFNV6`nxI?Td&EscPu~C`MvWy@UJKmQgjkYp>+x!2C8i8V;zyG8vR4 zY~n@0LTKtXlG7ykY4?($^-xKZJ6v(%UeFDIFrX1|ACJ0ch9e?L4u=PG+dbvj zeovWWMK7ypaIDCL8>4}6C?YN(faTbIw~gOw=*Q1{@~Mf>dOFF=k1Sm~dJSZ~<)*jh zG)qB+T|P-onsAb6=9%6lw3_#OUDV2_ypQVZv9*&!zHv0he9SkHvXWZjeZvZEEq&zw z&GJ9K#kowo!{*BKBoX%+P|N=wkZev^Hk|aK*gK4C>o&?4@Lt{6o3?cn=w zx&A`DWfz7J>p3xyrqhxgp7oS6rv0oS<58=7jlHvLW0NZ@u5Q-0%zKRrz;mVjsd#0Vzt| z$I=0=5AM--LHS{h3(X%J zBe8x7&sF=FV!f=0D*ag*NfL8#@#>1eLT^O=xp7 zNjAB(T~ADK+Qk7gcwZY0o+vprPLI!r8X2i^MwSN^C8ly}q)4V`MPw+QG$WBJ$MoKY 
zu@+H`fm78mUIW!1pjyXiDYfiI*egzghsGj%z(h`Lx{aptjHd1M8Nbz3te=)rFY%$l z1NqkGPiYwEu9_^{)S^`*X$-GfRZD~TomDP6W7=1H$P%>t z$&^kRluF{P`D{x_qhFg@t>4j&=aRi{oc=!IIL{U9M$k%bT31O^_|&>G`rOQ1|F$;E z4k_xam7wZJw7JnAdx&nF#JN1;cP|ZiB9*bvu@>r^B-G`%WxUgK_0n|NpjY#hjl?tB5MgE{_ z*7&0ktpx0a=%yMtPT8!9FPl}laBlG(;*i{BC5PnKZ4`cAz#EN- zZ>4HYBHRc4Q4#Qpj`p_E`ZKU{kSDZFkTufNwukKIH*NDMXCC-p0?;jF14{IGqk$D3 zR9bO?mu@O{bwjGgh)k%Yh&Wk2EnWB{awHR)TnGYO;+{>@Naw`O1@t$bwYip(_{q%+ zDpj(#<9M31Bnw3n_Q!j}!N!2$mvbfT#rnd}jKQheiSwv)iUjEGg&{A`+%iJ5#j&wv z2&M4WE#>-ks9oc8TdH)`A5wY7L%CAsdFV}918&*sNlk@Yz0MP@mxZej@7+2msdr*d z=U2Cm>7NDD9;*-A+j2KY>l`*Ff^QmS2CH=o{CO0$aWa*7>;}nm(R>$)9Ij~JMIV?SwC9ulXQ<}#e>sE4?m8mh z{kdIpTVrDz?(rz#FyLK4H{cC`n*HU3ZWCVsF#d{<0pA0z0x+Ih@sS4AoYbckI=rO= zG6AC{SP22yvZ7%O>}uG$Xls?CQphp#SbgS}H%IHkp>e)Iq|xv+i*ks1;Gr7SG63F` zz>pAoOHGopoW&l)SnBaM2y}I!TIV`Y>j6gqM*%Ma2H?Y@YRcL*QX*=UI0HBfxCp?k cF6>Z5f{GNb*qs|oD;KZWeT8mkW6zNP1G0d~;{X5v delta 3435 zcmYk9d3;pW702hiOkO4%B!neE7Qn>h5m`h*3REPKT_9k95F*25<`IUxS-dlW5F#Ne zL|MXeDEg@tY30+p)$Oq;Td@_e9|i2EwaDv7Z3PAVh~R?MPi^S^-Kj?Y`0{(_o_o%D zZ|*(!=65e?8(-2g=VoT6x#YidyN>wQypm}ZQH|YP^qxlh?aGovL^fv{#bn#J89lVD zL()+k0cwGxJh$v&dfRr&z9#*Tu>6VtI^?i^67m%97&@%=gp@Vef#7me)Ca@$-l)$q z<8f0w0hf1x=Yi)WjC8RF`X}+_qmvej1Wd~t_JvIGdyF3k5-BjQ`9n3y{*X7YL`=qr z!W5`uE*IZDY&;#aA0GB`67A!o)qQyNZD;6Jo_Blvro-^r1)P&`(nF0_Jm|H2b*AOG z{ShlB_VMN0kJ459#gUmr2kb+m(kSms>1%kzXPBoVT!#7rxWXUbF^Dd4@@RuD*~3Ps zXfqDL{*{E0BBIb=1J{5Q;2U5#pyKLXack0Iai55LmxxG6l*8jYerU`z`kqgXIY!rb z``DtKZ>9MQa=Bt|DO~OzB);X!lAiq0*qw#QY$PR;{#R=8?}VWf`i6^tH7+mby42UY z=4X+(ZofTlfkw%8@p#Kkukp5uGCIIVDkkgiVcwai)9==bK7srcI0)PdsFxE^YXIV5 zlM3k_o;4{qPh}6o8UfVXCQD^xi9P(#q@UAXeq++j()r?~PRh0~Ozy6AL-jL_Oz|>i zyu$8j11Ob?r*%+2d(X5jZaT{oXIZ_AW$g|%7@Cu61;gfwU?9F!6!VE$n|t?{L#i^z z9S_Qtvjc(NWUKN{`ZGUYnMZm2R%L$rt1$ltxX4#4htN4Lm_4O$3F4-T$O^Aq1`*+q zC04_ylUrxsLKpaj*_-HsJ!Z~b8Wr%WszFrDJF7-zsZU#q#3)SsfzMZ+*Ht0I`PAGg zJuV`~GXTDU(M`U7>AcePBsh-(#`3~>`Es4EocAbA<)rz4qtE#L`BU|~F!(Vatj^<# zYN1c~;c+~j=KoX=m2Wh7!MI*(^Xx_ViEvP1C-bTW?WW>Y+|hExTqPO{qTVm|y(gP0Gf zUbBxI?pr{neeAv-MCbYaMepkO%R#@!?}l^vxOc+9diZ9^sYbF0LN5ctKm>?#>Ea0_ zc-i6*0xX_EjZ%C2CHhMw8V^RoF>xNb&YPX=-+yL(GYV9+>M zOG=ApiFU8cxXC_cZYF&I90#($el6ATrTQVg*2AzF*a)b8(7|O(KhZ=R_X!s1_rN@Z zCj_lUt09rf$(-mD*2;V1k!V!*UMIcE496m(JmM1pfohE$$FtmLdGQ6D^n?lr;}M~< zIGL8|6Jd01;gcLoz^;`{iDRZ7osZm3YU$q#+`JU29icJo7_$MkLR zVjd8lIV2(R{5o&|I0SSpAazt(vhc(FM0f=q=A1|c9k=I4deWT3a6Jed1yl@%QgIQ; zq3Pn+_&+sf#=Y_`evJ4rb zP{*Kh0CiBDmNqBJiulC|*l>)^_6#fv5_{p_oBEs5f;Z@qztTOIXv} zWwz!K9Bnmhv-KqXuysba4b}gJLe8<(b~7z!ZmXhEe6ei?UA9L(ct)G3VH9oENmV@~ zzA|bi_M;cC;%SVT-nyVKR+aept#E%tqQ0!X_t{g|uZ)-HaWl_ZTb!lt4ByE~VhNmx z+t)75QhWTjFm}LL#y8gX%W8zog)ti-Hr9=x<6OJ0#*ho#C6)ubF!W<}0#|}WU00x1 z@<;3X6u&5aoQ!Z|$Qu=AAn1?B#CLMECQWU7GcJODdEe>ZK3IPilb+$}?ZaiqYiWOs zG`r^`)2Uz@>|X(B`|>17^ZDb! 
zxPUMC(xYQ3nTywtp}+9r^$RJD53Qe7r55crybZ}nh9=T5KgAyjMT3?pFNlbj=<7cT zhNNmHUP$0Yg$I7Spz7GW;b!fIo7*=GNY0RE|JLlDe00NTy%XNoxpPCcu9|)(*KRDN z3~t$YT5jokHu<_`BD-Gii!YT;ryC#K)Hl8B2As_wZMvmrcO>fCHT~pccP{LhPS?4i zqk#JH`i}g3<)n5Ma;D46kGipWs9TcgwBPKQM)WXu-!eo03?==O>$Z%PpZ@VJd*&qC zI;^+gWz-rdl*&nqT9LSF+?J7=*zxMr|2uruGRDeq()>$tiBy+o75sDFxpj;z>+IHB zsg%>Vl^5n=#$XxKph@zJs~hWs(q_<1gG4@C+qTmuHgOrz8$4wDyYfWp+&;B6aWg|v z`vDvH2cWL!Z$rHU90RTbUjmna?}2ZD>k>{%tU;YL>PHvJs3#lf2N)7gYEYg(@d)nZ z>OOVBl`4m(V!VP+|DspxEJ*b?K{b>K;Cdhgpf=G8sT@y*&$8-#{swjaiPacg1GE8q yfc?O0KyUoE=!1y{bsFjn@BwfExD03-s=-nf8Eow+OsuR@-n!!&jpp{p2K*20UBK1= diff --git a/deep_autoviml/utilities/utilities.py b/deep_autoviml/utilities/utilities.py index ec1a952..f12dc39 100644 --- a/deep_autoviml/utilities/utilities.py +++ b/deep_autoviml/utilities/utilities.py @@ -913,6 +913,7 @@ def get_callbacks(val_mode, val_monitor, patience, learning_rate, save_weights_o callbacks_dict['tensor_board'] = tb callbacks_dict['print'] = pr callbacks_dict['reducer'] = rlr + callbacks_dict['rlr'] = rlr callbacks_dict['decay'] = lr_decay_cb return callbacks_dict, tensorboard_logpath @@ -925,14 +926,14 @@ def get_chosen_callback(callbacks_dict, keras_options): lr_scheduler = callbacks_dict['onecycle2'] elif keras_options['lr_scheduler'] == 'onecycle': lr_scheduler = callbacks_dict['onecycle'] - elif keras_options['lr_scheduler'] == 'reducer': + elif keras_options['lr_scheduler'] in ['reducer', 'rlr']: lr_scheduler = callbacks_dict['reducer'] elif keras_options['lr_scheduler'] == 'decay': lr_scheduler = callbacks_dict['decay'] elif keras_options['lr_scheduler'] == "scheduler": lr_scheduler = callbacks_dict['scheduler'] else: - lr_scheduler = callbacks_dict['scheduler'] + lr_scheduler = callbacks_dict['rlr'] return lr_scheduler ################################################################################################ def get_chosen_callback2(callbacks_dict, keras_options): @@ -948,8 +949,8 @@ def get_chosen_callback2(callbacks_dict, keras_options): elif keras_options['lr_scheduler'] == 'decay': lr_scheduler = callbacks_dict['lr_decay_cb'] else: - lr_scheduler = callbacks_dict['lr_sched'] - keras_options['lr_scheduler'] = "lr_sched" + lr_scheduler = callbacks_dict['rlr'] + keras_options['lr_scheduler'] = "rlr" return lr_scheduler ################################################################################################ import math diff --git a/requirements.txt b/requirements.txt index 4c27ea8..da502f6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,17 @@ ipython jupyter -tensorflow==2.5.2 +tensorflow~=2.5 pandas -numpy==1.19.2 +numpy~=1.19.2 matplotlib -scikit-learn>=0.23.1 +scikit-learn>=0.23.1, <=0.24.2 regex storm-tuner>=0.0.8 emoji xlrd tensorflow_hub>=0.12.0 -tensorflow-text==2.5.0 +tensorflow-text~=2.5 optuna -mlflow==1.22.0 +statsmodels +seaborn +scikit-image diff --git a/setup.py b/setup.py index 3d169ce..c75a9e3 100644 --- a/setup.py +++ b/setup.py @@ -15,36 +15,18 @@ ############################################################################################ import setuptools -base_packages = [ - "ipython", - "jupyter", - "tensorflow==2.5.2", - "pandas", - "matplotlib", - "numpy==1.19.2", - "scikit-learn>=0.23.1", - "regex", - "emoji", - "storm-tuner>=0.0.8", - "optuna", - "tensorflow_hub==0.12.0", - "xlrd", - "mlflow==1.22.0", - ] - with open("README.md", "r", encoding="utf-8") as fh: long_description = fh.read() setuptools.setup( name="deep_autoviml", - version="0.0.78.dev2", + version="0.0.79", author="Ram Seshadri", # 
author_email="author@example.com", description="Automatically Build Deep Learning Models and Pipelines fast!", long_description=long_description, long_description_content_type="text/markdown", license='Apache License 2.0', - license_files=("LICENSE",), url="https://github.com/AutoViML/deep_autoviml", packages = [ "deep_autoviml", @@ -55,10 +37,22 @@ "deep_autoviml.utilities", ], include_package_data=True, - install_requires=base_packages, - extras_require={ - "text": ["tensorflow-text==2.5.0",] - }, + install_requires=[ + "ipython", + "jupyter", + "tensorflow~=2.5", + "pandas", + "matplotlib", + "numpy~=1.19.2", + "scikit-learn>=0.23.1, <=0.24.2", + "regex", + "emoji", + "storm-tuner>=0.0.8", + "optuna", + "tensorflow_hub~=0.12.0", + "tensorflow-text~=2.5", + "xlrd" + ], classifiers=[ "Programming Language :: Python :: 3", "Operating System :: OS Independent",