From 22ea8dde84b5985a8bd43e720182748c04d12d40 Mon Sep 17 00:00:00 2001 From: krfricke Date: Fri, 26 Jun 2020 00:59:54 +0200 Subject: [PATCH] [Tune] Added XGBoost tutorial and template (#9060) * Added XGBoost tutorial and template * XGBoost tutorial: Cut some clutter * Apply suggestions from code review Co-authored-by: Richard Liaw * Added XGboost logo * Fixed further references Co-authored-by: Kai Fricke Co-authored-by: Richard Liaw --- doc/source/images/tune-xgboost-depth.svg | 242 +++++++ doc/source/images/tune-xgboost-ensemble.svg | 680 ++++++++++++++++++++ doc/source/images/tune-xgboost-weight.svg | 241 +++++++ doc/source/images/xgboost_logo.png | Bin 0 -> 22375 bytes doc/source/tune/_tutorials/overview.rst | 7 + doc/source/tune/_tutorials/tune-xgboost.rst | 518 +++++++++++++++ python/ray/tune/examples/xgboost_example.py | 52 +- 7 files changed, 1720 insertions(+), 20 deletions(-) create mode 100644 doc/source/images/tune-xgboost-depth.svg create mode 100644 doc/source/images/tune-xgboost-ensemble.svg create mode 100644 doc/source/images/tune-xgboost-weight.svg create mode 100644 doc/source/images/xgboost_logo.png create mode 100644 doc/source/tune/_tutorials/tune-xgboost.rst diff --git a/doc/source/images/tune-xgboost-depth.svg b/doc/source/images/tune-xgboost-depth.svg new file mode 100644 index 000000000..5e96a4a8f --- /dev/null +++ b/doc/source/images/tune-xgboost-depth.svg @@ -0,0 +1,242 @@ + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/doc/source/images/tune-xgboost-ensemble.svg b/doc/source/images/tune-xgboost-ensemble.svg new file mode 100644 index 000000000..53a029c77 --- /dev/null +++ b/doc/source/images/tune-xgboost-ensemble.svg @@ -0,0 +1,680 @@ + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Data + + + + acc: 0.7 + + + acc: 0.9 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Data + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + Data + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Data + + + + + + + + + + diff --git a/doc/source/images/tune-xgboost-weight.svg b/doc/source/images/tune-xgboost-weight.svg new file mode 100644 index 000000000..be342aba7 --- /dev/null +++ b/doc/source/images/tune-xgboost-weight.svg @@ -0,0 +1,241 @@ + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 100 + 4 + 96 + 30 + 66 + + diff --git a/doc/source/images/xgboost_logo.png b/doc/source/images/xgboost_logo.png new file mode 100644 index 0000000000000000000000000000000000000000..70014b9fe536175170e97509e5a9e278a37dfda4 GIT binary patch literal 22375 zcmYg%Wmp_d)AiyOG`JJo-3c1p-95OwdvJGm3m)7pxChs;5Hz^Ee3Se6e!P3JFzoL1 zR99D>I(2$BQb|D)84({51Og#TOMOuRfgmz~<6L-H;CEz@J@5hu!BI-v1q4F+`0oh; z%F4k7fe_`b#Ke@8EFD}OTr3?NNu|ZaNFALW%&lzAKp@YxTvZRPQuVy96M%P>MwkctBb8aWPg6z){7eB(4v_m?0(8g&nEvU zbpRzd{Q(5Rf1K@q;}J?Y*x%jWH$SjFzZoiev$=5iBuI7%e(Q>Xga)B0%$% zIgUX5aO<_H&;_k&fGoj}`t0in1v-eoydR^zcwU5w^9q$l1&9a{k&sR)O$xhNcA^E} z*|#0GqW$lHci+~%FF;ON!kIOF#UZaw#`f;DDscxB(4wsN_Xp%}BcTDWakR64IO_En zu%a%tBHNTc$&Yxot0aCQ8%zCz@i&0&k|gaFA=v{NR2`<+r_ch|JYn1Ce+B^=jiNQ@ z4ifgiW{kl{N(7-kMY9v5TL2_k{uAJ>kwD(dkT)+FgytKr67pzhX~w< ze$47Vl$?H|u09+{W7sdkA1L|+&_rQ(0|^leannTb)cW{Vpqqa{;TNE1ZKE~yv+^Jx zY+>jH9XsN{8sX;s#O(@0fEF2r|7c9c7lCdPxAJKk9w##n`jfE~M^a!3>E8%U!YM>Q7=(llB_8~4a!gpkZ!{5-%0SJh6`L~T{%$o z1JjCZX7QVdKL@rHmCwo^y19bQBkv6`Lr% zF&+m!P59~)v?Qvj&d7+#@+b|cQ9c?bOHrVVCG9F8l2InGNh3+yNUcjTmZ+8p&Y@Z5 z$Wa|9*o|%O!Z!p~lcp!OCZHxLCOwZ0?6zX^4MqHsLnxw9DM`VQ;WQK4gD%Zy6U{8B zRGzBHTY#^O;LF#MwkvQ~Vpa{W_{iawMLCs?EjCfYSiJZv%gVy)-jeyRYX`#b5MuH4 zU+Hr;hj4!n{)XImd_WFE4jGX$p~17k^T8{}ZN+2Glq-XoPtP1xV_9Y+$h62n&2UiH zqPwH>#Y!BENXQ8p~@F?QL?o?O1o4ITD;VJ@)#Ga65jB z^7r?w=G6{_Ce<99W*asF#wJ#5>m`FFL%Th+Qz zE7fBsalDH;HhIQ9t6l3av7x2nIYTnhT+t(GYiYg{Flq8>_thC1f*M{Lw;IdU(o1^B z499j$nzl*=n%N(+`LZWkzSm^e47z9AfieGJ#?#Kx-m2QKv7Wq~Af9BMz@GBn8Q)Rf 
z+1>o3bjiKu#3_?an7~MPPsdTU zD~~F-&-b24-NQfYm}{QDEza)D-l$o%TlhWsdC0HoS)M7IU`&7D^SQ{)#l6-ar^%h% z^6c`T*9iyA2JQyh2G8BZij6r`c}pyIqw9mZTip{rMtp*w&7P+JLJ1BFLgstthxe5A zd;)*&A@^VM&-90T$$RF1ns|+W=-u|Z!oF01F@ora*n{W}!VhYMvVkIj&V(+74ateAc!UU0RdT?#5via59f;D(Ug6|f+1@bYaBioONZczWs*vccnLi&Yz7{T=I>DU zDED`=1wLX+(l(-v#0oDH=g|+7mX58G=}5Ur)z!)Mo?)qaSNqwbJMJ>RBBq^x-MYou z-J;ssR>juzB)cuJt+Fl1GyFmPG9AP9BL%`~*RxHk zTS{kv9c=z~^hk~rjP2W<@I$HXN8?JHr}pL6r(W?+Z1QMT91kW|eU*-{o>mugeiN1x z4Qa1Rf3#H8udKZ`uHsZe=1NP&=z}#mQY%u%ChTi(blOd^j8ng=pIfId>S_eg;iMW? zJ63r!nWL?qo~#`c)xU@tt>Wfc}KWYfV*a&nng0QtxjMpKFv5 zA}s#R^OTdWW_nLen{IKxTl=}Q;NS81CIlV5`~JV%Bs~;|^5T6;ec{Pg7iKX})_p$>W*0N8tJenPK=um!&?DbQF%Y1!ylnx802Q~(#z?lK9xA4bO zUi5Q(y>8Wv#dZas3%kz`V~TT$42UFqTHpQ8S$9m`MmXa&^9pilK3_iW1}E>7e=X0; zRS^2Tkh$S?FZYsZqpz&rZ6P+3CxqjACf90sKX$!&5U+pH<#_Yy-ejG!Rkr4H?Pp1U z$JcGCI$eijv-^otO@Uh18~J1Y+4K2;C+ogNm*t0LCjES`?(^Xjf>wK~--K)+jEkW zl8fXL2#y{udyJx0&;)_PT3 zJD*oo)bWjwkC1dtdwsqvd#RaT_d9Pp?*NZbt^7d-HNdttu+ zcKu8_m(LEcO;S@S6?qWIn-T;H3;}^2-+{k>K_E945a`GV1mXkeDUL&u;gAFfB$p!n zMMTYW?IhR3Q+L_#UI;E3T12Jf3nFHPh;oDNm{`jmdi-ep-ieuoI|cJXqMCJfkKJ6^ zm~N}QMaLYs6#=*3I;1oo5w$(Wh`b5)Wp%mjsk9R2J(Wl>F0|wM^T_8PmiVO1XYD0Y zfBo~};xXm~cn&sP0>Ibn!C1ks&v$;r53Y%ZudC_jz3;6~KYqRW!8~Xs8v4CV+=Jgy zr%`wvw+i&W6wi7Eoa>0#tg4@cW2L6<%vv@v;m3OCj(lc5W4aq?F8q~f$)Q6KEAe|8 z8#tp+p_(GNy=n!O+jb=Z#0BYp;FPvta1^-IE`%d0=djUMj}8xkBEs*uwKYec0a$TL zd*>2A!ro)ONC%VIZ zgTp%=J-ts51q<@mTs!BjzJ5wcmQwpI6=AACT1nwFMHY2+b^W9q)on*DkCw3@D7bXO z{=*J`{rddnS{*kP47p+SgzGbKcGPQ^)9G4qk`dc96JY&T(tkGTqgH$W9@MXO_4D(a zL{C6E1Lt9^8HR6gTf7tUI6975vxrZb`uFytNHg9TPK58WtIwM1>WgIejN522P;e+r(!uDWwzOoVrnm@c)5yMWuk=T~SC78r z&gU4y)2p;_D1>^>(8b5zKuiY-7w95Njfi}Hf5W;p9D6AYejgtr`RCqJah3+*o#r8b zO?}n*pq+sFu7KOyTcWtTgT9~;*)$`W)bPx`0{2e^``})cM`nfKQsB~Q(0SrrA`Y;E zAU)Ith(`8tRM5_qi_y{yitppw^T^)213lN73IBJCDW93@b|}|TUP1F_k>xkPXT)bG zzr=@&14Rsi8Ogj*eL%=Y?pUSyH(FS<9&AR2ADSFKdTQ7(G6GZ@&}Fq zMHy|klV)z9q_A)st|puNo&AF0m+E*psr(nyq2c$d*ZbB~w2b^R&*hQ!L)UgalK1uA 
z=T4HWjpX>TdhYCp8N)==-tSk0{$B5IBv=zct4&t(LR_`_&ny?dJcPtQE_{Wk?hm~~ z51w%^Pz`(D-=L=H5BvJZheU}O?dFY2gx|ZQAwH9E%}_^7@qG_P^EiGw4mfr1R0Q%! zf|M8~oZa~>KMh*R!Z|LRL;ks++W%Hl$m#d(^1FV8FjPgo?)6Iz%#O1$ma&jVUId821E`@a7^r@btPM`4myBqm&q-gf=sp4)|YH<6ugiyFjjAB#dGI}lhh zH2jaH(kHV+R5-MxNH9!OUlOn=GuGDDqzj&IDpZ#tSgUqbgy~YB1e5@H&~NVYcOz!;RPX^z^uW{P;0I zA(4#^NYT2=%A@M_9cc);;T`_{Y+bJG?!v07<1=IeE?)~Exk&n1$QP$taX^oHwa(t& z-q*gS2hQ!;PtVWbUkeL26lNJs-_UfUGU4cZn$YL&FvQ%if!?R8~FL1HXs2&BDWaBpqFs_IloNQe)Id{qX2*(zjusdvG{>)YGx>}+%Q z=g++#7P=%JG&J15Wo6|hUQfwDWWW*Q;p4B(&j0pv5D}ghH?cl?4M!zf1HE`5)ch58 zk)2@gH*!=v6OlLz>8xw@&@?mz`*6xvG2FE#Up|CqXJw&?7c0-pFUnAU2QW+F!?l)o zPrZ5O(9u5G^N#%S(r0~5&1HU@-Ui%Mh#SD}pVw|v-ou;Qg@rZ!Mn7AXl2JB&1xy~Alu#J`#IniC-*r>5ua|NczjG67im)wq^6^njkGhD6S zBz(y5o{D$90+kYh;nJ@6H#VEgPpp=fa+!@O-^H2B3)25Fr#sYQapI8C$!=MWl}_dJ7nL+ikr#h?#7KkuSRdkU2d!xW0fjmeH) zs#3B15b5h5a6TMM&8X7ZG|q2z~-GIiPDzno$$q7nK-OT_TdCeGk}I~FSPBN^*J zAR7hHKET55WqI#ed^yw9ILj0~#@J5E*UYbfd8JVxA^A4}S`+^#SK7u%igeZjeScV1 zZFUCy*pA_mk!9>>eO?$Q+0esj%JkFq)_~s!*Rj%~e-pm)(8)`!ReQ;UobqSxKX2(H z#r_6T;o=FIz#v1s!Gh9B-ss!jk;`!2m8RN@2UqrQZjb8B%*5;y zm0*2qp?qd=mK(Jj=Raa=lt^d)lU<5m+gb=*y9~28GtWMTZ36k zS8EsDb#6%siMLJYb^+?hN+@!b8j7*7fo)$7{Mb&t+VUVj%CMT0)_0h~0jE(7Or|dekKv|v_UIrWHe0v+0+yLAxH(|9*!v-vhCj(+q<}RnfO>mVf|%&M@I)rdzz^VdA2hK1rd*H|%!8*)9I<=xJOzH-AK% z)*-{;?R}KlQXC`bp>!VGb6!roJ&Y;l$u3TN5@2k1 z=##$O8=N*vT@=mf#7261OM;3)67w)j(XSil_xCebWQ47vOc-|8FSTge}PwRMHXj57!#*L9g`qpIZ~&WuEMG zqGMM#HeMJcSjPpi;T-|q8w6Dat$GuttE%cEhxjHKL&Jb|Elqn`oCBFIK9WFHq%0kr z<_BNuz0vvm?3*^DX z#t=tz5%&fFjD*8dm4q6z1|{w+hHw(4KtO%rh#UJh<2B5DM185LhROEMXjV2;-mR{w ziJF1ener}9rA0+^+DoUvvbcDva!il{J_ihiYq4PwZ^i*ww*i3cOkAeeSJFco08)j3hhfWVz8&+DP!}m zm205S@bjz2Y)|uUVM?bbs;)gtU9*})kaY(bT@jjGMYdDv_HsrF_Xb|7Gy`duJWdC zLSaCcqy323W$m5W@Hb4C?rB0UNTO3TM_tND1lcC9J)PnF&;9d={Is9oRW@*HrKK5c z>-uLi373CM1~mb#dKAV#aZ|+5hJ&^_BDcA93h%^}4J|-C$7V8gMOqo?@v^8wMQ92Q%2fEG_)VN=FMZ<+sU ztrk^=S4kFCXuR3Ss4fv4yII|`iNG6NzzahIsy&a#-s-T$DD(3MVRpTrt`qOZ@qF 
zw7hPmvpBOWfCDfcJp4QL(TWAOf0uRH%!#Re0RlDUQ@V3#t0*dFBgGF32#gCIZI@zg~Clg17;(qmdqG!dHl%7!eg{Zlfj zUlu7*HuLB683v@gGLqgjBB>JD4;5(BN`bh@!*;KO7SoZqIF(-6()k);$}Kkobii{A zX|Nxep9VZ%xcWjK({)WP9AHgLvE96-n)G|)Sj-uVo7*LqB?W#rgC;dl^D0pK1*)xknTUESxT zabyF;zeQ`ms?jVKA=5i${)s>}>PP7zP*C|q`1saw62%HEf&WSc zHE!MVvrGjzHZIQpYGEwdAzq^ROhsn`=t~SB1%>WCS&rJ09WYo zT}<9j5b?VU+|5NZ-&tj~)J9y?M2+mcUjhHW{Q<7n6TfW;F)(rkA@iV;$EpKQ^k9X()OKR4Vs>;eYD{Jec9+mf? z$vbd&LFKM;GmcvDKTE0mzv>KA$rhW|J1WmkZ(_jqDl`PQ4 zz2@iTAI5F(ebY_+4IvR)_Uu^P(C<7YftvY zKfrK3jnUA-(vk<_2KB=SqXA@p00 zPMbBhR&H#`K1z-wIJJmEg1B5q^}h3FWgfP7?vwRGC(wqxun9W=PQ9dH#_x6CzaQ3pfPTWq$B%;x=|``Ki~;3du|t+Pv|SGS1la`Uc5(T7 zEleW64QJawu}ZW~tp~<7a8FO+z^O_Ys>dpFV;>s%h%# z{k;vG8=~Ky^YYvvZV-RqR<&VlDu1gZPqk$6EHqlx_wS<)n_XTd?d^||vx$jS$z!`u zYIMy^|LZX|^OHw*@b104ckEmF_3_|u)E{1V00rq4X!&BUOW)vFCvz3X{;|K{OdNJ$ zl7zb2S_L&t&5>PXF!K~CF)fYDRWD=8*m-)^1c(I>CENuxYl-QlOS08GmSc*mupK(u z&p7rqWmyAr)Hq|C3X44b8)1NG9QQMS*zUa!GO43r!8j+{ zl?ms{3E3}%JWzwL{_k5I`};0ec$!AL{9b!KNRO{x?O*ykPT21dkho^z-!N_GlQ<+w12dt`_cLshaLMXkyOvNn$lu+`w)N3|F<-BJ zoms_?L3HJP59t!jup#1+= z!;OGM`j5-~tJFZxg9|Lk|2_Cmsr#2~!~ahFKL^k|{|!YL{EyiDL(*>})BU1}8R5%! 
z)b)^lMh?2ZtT;qbgV0J?3QiW307;VID_$#osLza?+kd-WS#51+@I0wGs z$AU4H6Z(C;F-b2YaCa?~xjO(K48am(>4S}7dy#45b1vHE9=tA&)=Irk?lb)Ufpm)i zfU#eKL( za;&9hr4xuZYI2K^Hu($^=v2e`H?`n;^n4seH-C-Z*Bo=#%%&I#HV-H;P$??J(Kh)5 zg;ekc!Z(Sq%irLOsTBxA?uH)VN$X5X>>2ov4&8b?C)mHef%k(0!#N^QPV5L=2Z0(yFBXEnxSJQbqx=b%`$;$G z9m2c5p4@~6;~LU%Y2b zxMdLt2lJsWda!2=oAeqAU{^1MtuZh9s=;OV_#rPq71ka z$`?zkH3?;0LPb&s*Jgl4J1(p3%SOkXLD+{{?iSVC!R|zybKh1Tu9GHzcSqJwn316@ zI-9!U%YB<%s%MKsZN}!-qqNy$d^b<@5UQE(~PR3c%#kf?Bf9xa14mu!95J zKQxBL+`>QA3qcXlRv>gPvR9ZYG36VySch89ef?ftcK{ zAER=Nq(|8`EN~o2lm*zB=oP;a+-}&)&WI(WTVqM$@0!X2U970j!RUW zjobFfq%}oh#Py=`s9O+QmAHz@wV8oB*iwl6mLEbo& zNz6@8#2xwptDZe;f<1(=KCIN0wLwp38nd+?(NX*!-RE5uKqxHhuax8YD^%m6GY6_{ zYHfedZX0iFJk^(m+2Ri_3rq}z)yMForPQ}i`{-CR$2LMR&_t@4A<^(M(2eRbsHR$J z-|%?bw3G$eId}7v@JJ)8I-(bAv?jF?#7)j&f6}>l#veW{Sz|!u+;_@l+;&P$?NQSm z9kR7w?$%-Wg;Tc6K_Jx{7J&U(yCPrJy68h%5uOM30P%(N7kV0V0OZQg3%r}aDV3AS6v%DB9)m^+UYUz~E(n40q=&~X9; z6L5O(p-8?Oo<+p?LW(3J`bS{uywNPzS1IlW&Co1t?PvqnE@8Hl8|q9l+?!pSqa~-B{9HL9|_q2wjV+o>k=+3&WSySt6b| zN|Cp@MfYedOb(1lOH?k!;IxB3PTDcsy@XmYAF?=+0Q4X%q}9RJk;)nPZjdzit-@Cs z0d{mL;W-kqKW6K9gmT-j_{c-bzxNLWsnWe-H@+5(m!gr+!Qdga*%2j*JCUuAcgb#o z_0jbQ4(R&K%Lxn{#B*0*!UWW$rI~9;lbXvqq#z{nh0cpQT|p0z9N{UJ#oG{X^#S-) zo8gN(yrEWa!kHt1pUpw87RVLo<1F#UVz2ccv(JXDQE_5>FY*|@0pE<|!wq^bNpF{t zw`>|r&&w}3WP&SzTG=lXCfuuDimzUU>md1LIBC#^6KnKy`^R13=yiN<>6K!2T;i}g zUa^x6f!K&Vya&V!JEEUKE&1CY*uK2_9Y zVP|web*fO7lTvRM=D>$hVSQsniC~1$fvLkV3Y=az}S~Z=iPza;p<>o{n5}g zI1Jp(=dvjMwe5gahG9MyIp=xvCaORX9#?{s9a?d?J2~-dP|xYvh?QsfkzSe8qKV7u zCjcMtCG7FjT;?Gy$FPUJX$IaKR`5SKe56+BDFCnK|1^8#@=`E#Q_seI^5?+Ur(v)c zLTsn9$;9)m<9Um~EIY-OPPIx#trNDdI$7&n{3ME1e(R{=$4itjI8 z+HbePcb~!?u^v75Em?`Bxv+41YC&ih33VHzqr8vYpOYr9sP07C`^*x2iy|YLwkRZS z@>T~JcD?DKpDJ6UlDl_0gghg%e$!oN_V0~I(YbsYBdq?aS4?o;N?A?*n{jWO!fY3U z$sI$xENg?E*7v-Xua7iUm*@O&CJoJ2SJ{$uUu*`z9REH^`xY>zau|4zd?iC_Ri<)> z{Lg3r?C2n6y{pNBAVFg5lU`)|)Z1$CH`3y3J(s4OpCilxdC+B(C!uh!X@sIZ} zw)A83Bv^3o1G0i~+i=$D*I7B?S_~p_spxkxRl~9S>A@CE{zx$Ef7IoOI>cSOpg%|%3rcy(PoE7O+E&B>=q5;q^46LD 
z(zZ~dV3IMUvWL0U^mo)h;X=7v~rspmiPqZF6s*l%x!vgwY`wU}bPU*T3PDt=xNVgb(14U2kD^ zY#J5YKzQj{Y_Fl>_XgrlfBIQ}oUWfFYH%&ku-PKs^`^@~FFRoD3fjh$Y z087xnB}!S!Th?xK{c!^4@YUQBR?$DD;H_v=kIZix^x__g7hoX3Sx*5S!T_$DBPo-u z>$c9ueK6p_cf+#30$dWWJUXZ18V~pR9b)z^R>oX{)tAtM^{-7vaW@TrK=)xgY9#x+ zwz&5fYo?LFH$dDs-V+{+g#QuenohuIwO6Jl6Vk)rtn>E~dJN|(fP(-=6-%P^SG-hV zXM+1~lFk)=4D@EF`Kx^gEyi;s$~&yDg^|Ph4I6q7&K|G`E7)5HfgVu&leC~Zi54zM zU6~h;d4Ls?N>n}I1X2TJ9XXY^`yK>t@|l8q5Dh3ge-hOWFwrBuDg7U9dAfKHUW9}>=M8K?anwbkcUf`IRED&(TiRzyX$VqXy3saK`y@{?b=cp!ZsjTSpC|{FNB1c!$i>{XUkCbT*e3#PFTc zL)hBcG}_?pXWZ56zymisJq{UW;eNlCh)A`XIQX#&7{UTe|3Z2hC)HL%(t#4D{C^QfmSflbM6Q1I^;)XsX&ZxZgO}kfC*5f}Bml)+q|vZW z<3@SSE!r^-5{wfi*xv&!Ld)VD>f(8Gs;*FNqfLHF(V^6%3R6@zx7^R+ZGuz;1;8yZ zCmL1CZ$D+9&%S^=$-@cK0`-_WILA?%0ziac6J>-_+c(Q(t~`Jn2G&eIM~NnCHNxtq zhcL9Md_Xq+avN3Ju}zbDK0_u@Zq+4f%@QDD#lJ5qD(BOC=*eRFz{kh>LX^C>nWp~( zM?3(k52XxchHXA~-~xgXj<$4<8B8jElU)_JN zF{9aQ3T3EB<(Or;I5Y*zv!2mA2)3Dd`%ng-ZONM$qJDD zQ`dBv7BE}GiWymxyxVZ{#Pj&8Q5WxIkcH$RO0Vop59NA8xG3a0h_9WJpD^_5RpVVy z!blU_cx*{WUhtC4=|#8EmqA;@?Aq*PImLtXFIpX1fx85?LB)Sbsdv9&lbO6^+Gar=y(!&}0A!OtPH{ zufMFJz1`%rJ1nM4qn62=6m3b$L#(N63N~==`Zp{fVHfmjy8i2-60ZHz+pvO&qXvEX zx(wYE2&6q~Evk{oO>8YGZI}U7L5DqQlEIX7b=Dh$d}S*jG58OeQTOeHlD~JnWq_Um z2$pr0PSpQ?1|pzAFJGM-E1ys1I?Y7k58K_niT(~4Vwq{#t(oOF?aVf&3#5YH#Grsi zWLjF}+(#G;J+@$=v08&IBhft)2ag{n^B?k4hI;XvH3>2WbN}U9$@YjWJlGXGEPmOY zHiJQZVyrx{OnWc2H5?(_#)0fXrBPS%uR8SOk@ChJ@Z}9g_{YmTk+au1SNsRW)P(B7 z1rgM=S;_UQZBl{|xA(egHP9W>&qt0$A7Gvu;1%u3e*ewd3(!^xhA8$8pwxL+@dJC$ z9c(fwylG|eP8VdD{t5O`JHQl{HITd~<1I(mFq)pQwdA@9urhvPkf8bT_l?;F`kUlt zHEQu8>e|pCQYViGJUMqI!N>i2C5k0@fl;<79=cE_x=n5ob=h{8?*CxZSH(wrI=#)8 zU_e~#0jR{XVkZVD*C{$0g-Uj5NRC=56Zh$4qpBp>)IHmx^t+uo#a&Aybs;F@$kj~b zP8+dYv@k1Ccip4w^EH)jq}?_tWNuPWI^C5NAmK!qFp1c37+*4fU53v|# zjRi!4Y&3sfk8f3aX@DKi*9Rkehb;9humL6z0I|}Rv)mXb7oUHEIpi`}yOd6IZ@GQ% z*`@UTZc#Y5lnQt6>ec+xe)hIzkph;o9#e-z7keKlQqI3&$y&r>2>V&$ksE*NUR1h| z(Qi5267_~EV|6N#I*!#Rx*oh^gue}hYJC4{cO;4tHRuoP*{Og1U3kKlVo`ZyG9%xY 
zDz?)E^F51+PS*mhh2!%v*$=pfbbv}(l?luZbpYtIPnW~5UQNY6C)YtO<=&8_=M9^# zDcR=SCSbLJWwnutr-wD)7J?wqPr)?rB)+2r+Ent< zpS>;d(!yiT?}N_^Qz+b{`63*iu&jOg`?oGK#WdAt+f32+zCGtYuCn;|YMvMQx>-l2 zFy#h|{xw%Nol_+D$-*?f7xysYUTs(6yM+9HmfZba`CqC11fD8j--%?7lQBBPZGSp< z8Dz(ZKa4C)shpdomEBE=0*#WcuWhN@{=j^+{r7cvn!ZQQD7}P z{WfHe>t%{_P$!(Jtfzt=oUE(;?9ln;hkOjAA zEFT8PSLT4T@dEa{{~V&02**xvQu5?isgS0NW;@rVvDloBSL| ztx(_|n8ite_gNg_vd>AMPZ+y% znGA}_U|90$-i|tJ)-!FAc)do{V(vJS;@=)HO3JbgMQJ69XX|J~@<(1V{u zdlogdHCZoe~K1kpFbq02ZPFhE>Bsg`UOqr&I=^%^Ft*!ZuTCzSbR}_cJSt%GXR|vCjUaxz zOYq<8z+{?$z+;qH>nri4?vTTXVS4Xeo^%@fPrtkuC-M68t%tE{^;dOOpiQQLBYAH4 zF7k~Wo_QS1wnQ&)ou{gynEO{wRk0)^6>KT{W|Va1B^PbQ4uax@%?yD_q$x=tYgLMa zPA+@~R~;OTFLJs*-#ZEZbo^PJ&x9Fi2j{P2#>Duw156#~uX*NC=(EF;!05i7CNu%; z3?SfmKl9Cj<32X&*)RsRiXa*~um-oUEFVgME3On;+3*Kgdh>@5!70t8NJw^LE|{!4 zvOgIUlxDH_onvlnPs@1cYB5jgLJ-S#T?q1Lve08C4v3C}2b)@1G3$(M7$PUIgW9uB z4nH3Z$gZNE7$p?#KW*X;Y&{a7Zt3~)zO zW!6bNXyPY#WI1+mNymW-(5BZavD&rGmhZxtE!0NaBK(|E^|#MZVOo)3Mc?sNDNpj* z4#uqPRO~&4a!-sl{)YL};tMD@HUHbx>ZN8S%8$6O>0O-EKYn$|tSBitZxUqUIxt@= z{hme~`0(=`O7#9M)qkFuxUgNa@_l|UL6!!iYB5HmFU^U)Sh2>mNYh%OoR)o@l@R6N zc$84X*8-6%V8&U0ht52XVF^D-BemXeL6=Fg{%X)fL{h8P{!DDf-u$6j-$vIt7!VZG z&Cm^7TqY3&ZuP70q=Hy`;J16zD^Pxj3?iXUsjy1=p~G~LFgA`ww6N+UO~yf{;~O3?LZr%4 zl*`>NS~e!PHhA=qx2iQ*i2{3+9J>b1kUf(L$9w#wu%WnbFH{11c1e8C6!mz0IaM`X zHax!4_nb*0|2CN@nfqCgA zRFWgJsyu8x_U#J|25`IdveAFqPKvgCdlKp(F@MGUXGErKV$lRYABjr~j0`abD zvC^8pi(Qq7oz+*j*R}kSFR17^zDj$YOw>*k!seLdPoruyp0p*7^x6;thmU(b!>)LH zYUA3)FTYcA52Qk>I3L8ezNk{TyXsTiwJTA$FRJ8Q1o?N~_5u|07hcq|fNl@;M?j*` z+GPW?M?{5qPfXz|lehGTY*z9OTeFGb7LMW3w+0X0^tR1tTIaI(e2)QQ2i5!g16bG( z5C)hp2+!0X^Nt6%CU6V$p=hW;NH?&l?Vhu9!$}6&rGB*4%5Swcmrd~QmKU9U+htNG z3%~}^wX6sipeIz3*#iwn?BoZH#LU6rEtnY$hPC)@j|%OR^P`e|WbXcuS<3}8%~X@Z zgcx;UWcJWG7Iy5rnVMBJDt$!x$(u2mnlhABA{eGRQYTx^%tSUroy7iA#qA2hp=FkD z%q6R85~Oh!*X&RGp8>5joMrOc(2{=DfSX1yxs+giYX3a07wHbo6XG>|>G&{&GeXRT z=BtX1Da7KgVI;FiS{O?D-wUg)dHKW@5eUP@tz#&kbqSp9$%4cf@ba6d1WacEWgxneEuc>T@{7)o8KX+bsf{c^pnE5<)3sA6&- 
zM$hRF!k(?|%2|VNXO0N$z0;n!_tETc8$bFYUF-ng7U7^Cj`g=N=(h(C-&my^r{N8T zU5PUd^;~j(9XUV*%Nobl$(TqGpBD3$Ga54$pzyn`&~C-+emZ?W6nPgt*sM7?!3V>% zR&7)+C-cZg7G+H&83C<*cONtJA{60pt5rl6a;#ilW3-X0RxzIhQ3X1KL|X-MR_G5A zx#4kMwMrPKsxfL&3I8vsd+Uctp-_Ie6p-g=BQs46XQRDG+4l$NG8SH#@2R*$kdjeW z&X8wHcmW@iLj{VR_$N`lijRWLpfe*Ubgke1IkYt*?hWTA>>PE=hg$Xn5hU**wR42u zG5M)sCFQ-3RlM+3vohI>i5jwEFXankszf96qS%JqQw~KF zdq$nh4IeTrc}B4}zu!S^k@2yJ^=8>sOxZ>CkmZJ&vK-IR4_UJpJClbJ%&vL&3Jfs7^pTcPow3reh zx`6pjpRwKZb0pcc=DNBBic@nit@wJMWxnrzyR21gBUXvcU%^?$3nayc{YE|EU3A^y z3&B^j*RWPVGQp`%Z~?TSG7~#($qreJ2@0+jC&Ox*h8d`|q<8>B0u!aJ5xUj%5Ee`f z+ZyJ0QZ225@^TNTn$a3?$E=wl<7eIoOB84wP2%2% zo%++8&#J~v!!0p*#rINypLi94##o(ubOZxG6EES}c4C4cYEU#w)ML@Rq1fQcYC1iw zmtu+M{spA!fJ5Em&rhyAehssRf^7YC5%A3Fx9+WN*3Qs2o66*WoGBPR%WZ*Aki2NO z{4bt60}WfB?mO|`Wz|FIm+)2? z4;p$LM+~}G5>HI_@H|4Npnf*OH6NmZ%eEwF%WS6fHK_Ihrjfa)_YH7+$n2=Yt4d z0|Dz*#;z0TLSOi-4eyytLn!t8XcMESs|NFjC?w#$nf^x_nCu6-E@R+PH@am% zOdtNsQ}`NCNIIJJz!mEu=>-kOZX2%>!8DiY)RR?Ox2t^TtdQuGU5T&-xswoFI>tHznF@Tobz*tWaVR9*~u@uSVFrwd10WV)p|9^j6y!7Y$ z#EHMUfr)2>8^MGkvA1Dlt6IV~rjOquI$pxC zqsP4DyQ|O+SsgEn@Ez1!Zo%fVG1fT>&2ECx6{ERiQdPnKvY=2YJpMj$ z{d<~P89M59r+N`DvuNtMj?jNvrY?If5`Bo@>Z zt5Ua$@za#ye^D)Pxt5v!N@gY(Lk)MM?^f-uxr*$lyuE35Yq6DNQ&^88oc-LZHTaOq z7*~j~WAak*n#OmR!WvDGj1FX?(de*zHwJ(;ZnU-6(%pNl=zdm11kbQ`>q&j}6)}R_ zhVV7oO)%5AYcO056>=U6-C!-hn=a6|F!ZW}-|n?|6c2+N_;<;+mheu|{a-bthB10a zbYgU)%8rt7vLI^z07$k5N1s@={o%S)$Q^ zR5TjR$Z`5GN>HtKp|9D-K<_p&Z~&+pq&9R+U%TTttFG?Bzuu0+oI`ji5i(yy2K)1{ z)9QDvCOf=X7e6~Rn9l*0O*PjMf9V>!kI4rZ1I7y4##LhUnNxa|jzy_9f_HsCt2Q+; z(Wyc(ZuC(NVfnzPaaHzEzqW-j+x%Us$&SsWyM)nT%xr;6j|MlYeCGzH3MM(*>Kw_r zHRVCM+{Y%P4|ypcf|Ah8;uc==N`AOdBvz>_g#QCvM&g*C>;y)VYw{9YqZOjbNUbK0 z(c>l{$+#9+y_R_QS`j~WC;YIJHLZ{88{Zxypwt0Z7<=e9DW7*@qSZtx$Z+oTB3=Bf z+tfpm*qZ^%8L+F#j=o!*cm$&!RSf8#y}(_Z8wsGrRG!ljeeG^cHj^p^FZzJ$CLh%s zuLHa`g-s*S->60~$<8uO2+9R6+r&s&C27jQeJDl?`p{>#FjOGGnIbQVlIdWQKqZQP zI2U|Mhjl9RxIqlOC6Dfutr6)$%n(?WH=li!6|EJ$HhnEVf1`Z(ydd4Cod{pWMP=!J!`mCW<)XwII1?)ee2dl^FHV{MH|K 
z{-o+A58-QlSl|90?1^f-UZv-Q2R+xuP>izuFV@jQYIaDpEd&#a#I|7c8B*2?uMp!$ zr`j^5Q7l~~2a5y{@4u>bD z73TB=fkP@|t|C`xGq4~*>Y!8*bPdU&QkwgPGLhW}+-da-*OBRan>hJn$@(#Q7_!A4 z+*;B~1T&4h_Ao8istER4anaB05S1r^v(ZmB&ig|A$)9W?j`pYSBshD7y z=%kF2G$sVnPK&f=gkIOF%YSI2=xWN8h+sZ7tamN<=gkc{ewyeOGc!zLyuCU_z1DU6Nod z!Hb@IpE&bPH`+MR0dz|uwbFZzVa0z33ow_=oyvE*;s#>9(usM;A^H9w*8 zp@@Fntt!44RsV}Da9J=1ElGjPRxxf%fy+=N76?URzXLpnVq#zcyBK@kN3LMzDC!jU zqjjo$$2gv*hxP6EP^RV?ip2f|*r&S2H!$*~m;8jhVaJP+=mlKkpDxuJmkMz_O6v)c z?4fi-XkUALzG!@$$&2vMa@gA~B&MZc&*-x!LrpRaygR z$@PqNW6}Q-io_a2k=R>9k=Pe8e06__O_j3!XqnqY=l@9UrO*`ab!aB}q754CCm+$T zxY>*b8;ZoLLy_3mfX}FYZ!NhAd9xpu!Z)?d*7(~?vBpp&_Q_Br_U=$5)+SAIi-J%j z_Uk&lM`eu5#K6 zHZU6`|9l+J9ag`$AKzsOD%J)`cUx4OXX?p5I)!_u9UHELCyDDeWHDHvD*aTdmD^v^?pl_YkV9ey7DVgL_xGH~7zxB`Zkk2FXp-5SQ zfmJMGQ@q<4dA!ExHrfx2mrBUJM;z^XS-)*8i?F=+Sw!Gz);x4Xf5RW#9y#)*r}kbo zUF=2*{0#8>z(zaP_!iOo-|}YPi_w24cd4y06p3vG{&MzILy=fJa9Vx})|4)wdth%{ zSymJ2m%c-irBad3T5=ODMwy!hEeu}t)mcRsZFLcqXsP#!lMe~>G5Iii%m}z}IIQ0pIrMC=!i6=D@E+8gZ@o>#jzy!Bk6n9+hjz%radCF%rzTVN}&^}nA6iWoC6_K|2;B0=`WwEThetw@Snn)rPn}%f8VqyvI^b6+vs(Uw1S3O1Kc}*Y zx6%9XY7ztTp`8V$%nJG&#K3bW^euTE>pOXl>YHwZf0gQFSx)9~IIKsb(bxff^%H~m z-oy$#;tu&y_<>8CK6O|M$ua=Bs;gWs2p&wAyhCo$iJCmaszcZ<&N`OxDlWsPs4 z|9g#0_DKTmVS#EE7wa0O-|~)?u%4@!Jh+y4pS}?qNE|#NpYROIBBThUl)yxh_GUaYcHh}h$%EpZ0Y3^cm8`i^>vv-SAPi`Rn0>cYsvI1XkCov^qH7?ex>YTdht;0Phy(9tx5k z!N}I{TFri574Zt5M-V~qPLElMfhUj2cAFUBZRgJ4N-^1911Cj;%^dX9=M+P11pbUrq5zTN||afh7AQSmjgWL`M1&k@cYG) zhXwj5a0a7LVOhRFiNyQF(feD-yt3TZ*+6#a7sTOcwbAMvD|m>O>rtx*HUkWNKpg(d>xe(Hk_7E$F;?g%nKJV% zTncR%mJ6Dg>Hj5h{O{%Ma2Pm=5jJZk2dfxzZnVLB#p%!1lj~bfLh57U&_CB2b)`Dx zItQIKamz*VLJ3xx1dlqMsDmhaQ1`Z@Soky;MZI>)Ye*T zSjnbe$^Ms0wA7sp{qR@Bi~rEV*g1?Sqa(mEDU@bZp8q-~j(k)c|5P2h;S0UitC%|Y zaq;r(Tm4b_?qx3ZHM$gX_ZAyhGIi)<;^k-+>B5>$v2ZB)>#&OH$*YL}_2Qezvg4HL ze|ZhTgd(vUfj?EfWxQFOegeamSSVzvT;nDj4(n^kC=*OCmDizAHb%}RkCAkB`Yv%g z@)6PgXANWrS;i>0!5FN}2(k*a%*~8H{Yi1)i*FXEo^YcLOOx0uzzK|qv8>`*%mE`I z;xGQ9IP%F>QYS9dTHj6YLq8)9e#=X)5VUYalBrV7w~J+sM*hq4yZqck&o_To9Q>x2 
z{HR>hF|4~0()Wm?|FMPSqDHvZkSX-nDeFS*lH^i{;diktczx(I?-!@P=`spgo|QKY z+nP(=DLP+rDON}w#J6RV$|e-b-OqjjffHBI|GiI$7eBk3kw+?xo+8Uo6;E_xggu&S zFlI-A%Oy62^}U_`?|w$S@R#ouN53C5+8=iUhk%#mOB}ZnWKse%KyN*X7d|Eq+_#6p z2P@FhOSRtZB#wSU9Qs=sraL6b^ROM{axtxO2Z_(MlKMsfZR$dwQ8&7F0~62vhIsi) zH;J5TOA{haKEBA-?m${AGTOF<~1Y}jYda+887+iw}}(uyYc_{ zNxgN?5mxOQ#k=N09_A)ydN+~ixI&Da+RSi=)Lu*%xXdG1uJEDHG?VV$!sO|z#K5UW zCdZ{fOkm_(^~(#+FosRr)=IImxWUtXitG8bC|# zWa6b;Mb}HKnHra8w2-@F*KI=ay?v6pQk z^8mjLyjS&5yiv42KkG1z*!@xi*#P+$RR{*4i|U(!uSwyPS9}TcvRC&RU?MF8b%y1*g!3|=vXA>g>K5{cs=%%H%RkD(9 z5nRy78y9Fkw5$(3Q$sFZO*URjHr_yTc%zu;UB~!{3q3Cnd`b$Uaf}F@AxSa`%5+Ft z6u{j;s*dT2_ll#vZ^8G-LA_#MhP5!}>AWy?68dOHtE9l7yMBx5_o z$nov;ce>DK^DWlTn<=Bws0X8Q)72RDXlg95UtUh&5Jr?vd@)|^XfzrEZkF@3 zdV?ASUI0#n!{MZjbMtV%0A3SD<>@-0DnUhE7xn80^x)bNDjKHoR8QflPU5c85Uv@7 zf0l5~dC_uUl#}#Y+dQHnH6>G-utw|WqJ&?jxsn2@P-01lr`9ux5_!?%0o0#s~ zB1X@xW4w@bmy~N9ltO9%!xuQS6l-0HYm~t}UW}^THJmTtSdWntR|!<4@KtwH-`uYU z*N#(BH;K1)5^rq+cW?%mKM$S^u2;)U37`Tk9Uwnli8ftJc49T@kw%dmX=9{YzLQx= zlu5aUK}nRu@|` +.. customgalleryitem:: + :tooltip: Tuning XGBoost parameters. + :figure: /images/xgboost_logo.png + :description: :doc:`A guide to tuning XGBoost parameters with Tune ` + .. raw:: html @@ -34,6 +39,7 @@ Take a look at any of the below tutorials to get started with Tune. tune-60-seconds.rst tune-tutorial.rst + tune-xgboost.rst User Guides @@ -161,6 +167,7 @@ PyTorch Examples XGBoost Example ~~~~~~~~~~~~~~~ +- :ref:`XGBoost tutorial `: A guide to tuning XGBoost parameters with Tune. - `xgboost_example `__: Trains a basic XGBoost model with Tune with the function-based API and an XGBoost callback. diff --git a/doc/source/tune/_tutorials/tune-xgboost.rst b/doc/source/tune/_tutorials/tune-xgboost.rst new file mode 100644 index 000000000..6444eadf1 --- /dev/null +++ b/doc/source/tune/_tutorials/tune-xgboost.rst @@ -0,0 +1,518 @@ +.. 
_tune-xgboost: + +Tuning XGBoost parameters +========================= + +XGBoost is currently one of the most popular machine learning algorithms. It performs +very well on a large selection of tasks, and was the key to success in many Kaggle +competitions. + +.. image:: /images/xgboost_logo.png + :width: 200px + :alt: XGBoost + :align: center + :target: https://xgboost.readthedocs.io/en/latest/ + + +This tutorial will give you a quick introduction to XGBoost, show you how +to train an XGBoost model, and then guide you on how to optimize XGBoost +parameters using Tune to get the best performance. We tackle the following topics: + +.. contents:: Table of contents + :depth: 2 + +.. note:: + + To run this tutorial, you will need to install the following: + + .. code-block:: bash + + $ pip install xgboost + +What is XGBoost +--------------- + +XGBoost is an acronym for e\ **X**\ treme **G**\ radient **Boost**\ ing. Internally, +XGBoost uses `decision trees `_. Instead +of training just one large decision tree, XGBoost and other related algorithms train +many small decision trees. The intuition behind this is that even though single +decision trees can be inaccurate and suffer from high variance, +combining the output of a large number of these weak learners can actually lead to a +strong learner, resulting in better predictions and less variance. + +.. figure:: /images/tune-xgboost-ensemble.svg + :alt: Single vs. ensemble learning + + A single decision tree (left) might be able to get to an accuracy of 70% + for a binary classification task. By combining the output of several small + decision trees, an ensemble learner (right) might end up with a higher accuracy + of 90%. + +Boosting algorithms start with a single small decision tree and evaluate how well +it predicts the given examples. When building the next tree, those samples that have +been misclassified before have a higher chance of being used to generate the tree. 
+This is useful because it avoids overfitting to samples that can be easily classified +and instead tries to come up with models that are able to classify hard examples, too. +Please see `here for a more thorough introduction to bagging and boosting algorithms +`_. + +There are many boosting algorithms. At their core, they are all very similar. XGBoost +uses second-order derivatives to find splits that maximize the *gain* (the inverse of +the *loss*) - hence the name. In practice, there really is no drawback in using +XGBoost over other boosting algorithms - in fact, it usually shows the best performance. + +Training a simple XGBoost classifier +------------------------------------ + +Let's first see how a simple XGBoost classifier can be trained. We'll use the +``breast_cancer``-Dataset included in the ``sklearn`` dataset collection. This is +a binary classification dataset. Given 30 different input features, our task is to +learn to identify subjects with breast cancer and those without. + +Here is the full code to train a simple XGBoost model: + +.. 
code-block:: python + + import numpy as np + import sklearn.datasets + import sklearn.metrics + from sklearn.model_selection import train_test_split + import xgboost as xgb + + + def train_breast_cancer(config): + # Load dataset + data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True) + # Split into train and test set + train_x, test_x, train_y, test_y = train_test_split( + data, labels, test_size=0.25) + # Build input matrices for XGBoost + train_set = xgb.DMatrix(train_x, label=train_y) + test_set = xgb.DMatrix(test_x, label=test_y) + # Train the classifier + bst = xgb.train(config, train_set, evals=[(test_set, "eval")], verbose_eval=False) + # Predict labels for the test set + preds = bst.predict(test_set) + pred_labels = np.rint(preds) + # Return prediction accuracy + accuracy = sklearn.metrics.accuracy_score(test_y, pred_labels) + return accuracy + + + if __name__ == "__main__": + accuracy = train_breast_cancer({ + "objective": "binary:logistic" + }) + print("Accuracy: {:.2f}".format(accuracy)) + +As you can see, the code is quite simple. First, the dataset is loaded and split +into a ``test`` and ``train`` set. The XGBoost model is trained with ``xgb.train()`` +and the predictions for the test set are obtained with ``bst.predict()``. Lastly, we +return the accuracy of our predictions. Even in this simple example, most runs result +in a good accuracy of over ``0.90``. + +Maybe you have noticed the ``config`` parameter we pass to the XGBoost algorithm. This +is a ``dict`` in which you can specify parameters for the XGBoost algorithm. In this +simple example, the only parameter we passed is the ``objective`` parameter. The value +``binary:logistic`` tells XGBoost that we aim to train a logistic regression model for +a binary classification task. You can find an overview over all valid objectives +`here in the XGBoost documentation `_. 
+ +XGBoost Hyperparameters +----------------------- +Even with the default settings, XGBoost was able to get to a good accuracy on the +breast cancer dataset. However, as in many machine learning algorithms, there are +many knobs to tune which might lead to even better performance. Let's explore some of +them below. + +Maximum tree depth +.................. +Remember that XGBoost internally uses many decision tree models to come up with +predictions. When training a decision tree, we need to tell the algorithm how +large the tree may get. The parameter for this is called the tree *depth*. + +.. figure:: /images/tune-xgboost-depth.svg + :alt: Decision tree depth + :align: center + + In this image, the left tree has a depth of 2, and the right tree a depth of 3. + Note that with each level, :math:`2^{(d-1)}` splits are added, where *d* is the depth + of the tree. + +Tree depth is a property that concerns the model complexity. If you only allow short +trees, the models are likely not very precise - they underfit the data. If you allow +very large trees, the single models are likely to overfit to the data. In practice, +a number between ``2`` and ``6`` is often a good starting point for this parameter. + +XGBoost's default value is ``6``. + +Minimum child weight +.................... +When a decision tree creates new leaves, it splits up the remaining data at one node +into two groups. If there are only few samples in one of these groups, it often +doesn't make sense to split it further. One of the reasons for this is that the +model is harder to train when we have fewer samples. + +.. figure:: /images/tune-xgboost-weight.svg + :alt: Minimum child weight + :align: center + + In this example, we start with 100 examples. At the first node, they are split + into 4 and 96 samples, respectively. In the next step, our model might find + that it doesn't make sense to split the 4 examples more. It thus only continues + to add leaves on the right side. 
+ +The parameter used by the model to decide if it makes sense to split a node is called +the *minimum child weight*. In the case of linear regression, this is just the absolute +number of samples required in each child. In other objectives, this value is determined +using the weights of the examples, hence the name. + +The larger the value, the more constrained the trees are and the less deep they will be. +This parameter thus also affects the model complexity. Values can range between 0 +and infinity and are dependent on the sample size. For our ca. 500 examples in the +breast cancer dataset, values between ``0`` and ``10`` should be sensible. + +XGBoost's default value is ``1``. + +Subsample size +.............. +Each decision tree we add is trained on a subsample of the total training dataset. +The probabilities for the samples are weighted according to the XGBoost algorithm, +but we can decide on which fraction of the samples we want to train each decision +tree on. + +Setting this value to ``0.7`` would mean that we randomly sample ``70%`` of the +training dataset before each training iteration. + +XGBoost's default value is ``1``. + +Learning rate / Eta +................... +Remember that XGBoost sequentially trains many decision trees, and that later trees +are more likely to be trained on data that has been misclassified by prior trees. In effect +this means that earlier trees make decisions for easy samples (i.e. those samples that +can easily be classified) and later trees make decisions for harder samples. It is then +sensible to assume that the later trees are less accurate than earlier trees. + +To address this fact, XGBoost uses a parameter called *Eta*, which is sometimes called +the *learning rate*. Don't confuse this with learning rates from gradient descent! +The original `paper on stochastic gradient boosting `_ +introduces this parameter like so: + +.. 
math:: + F_m(x) = F_{m-1}(x) + \eta \cdot \gamma_{lm} \textbf{1}(x \in R_{lm}) + +This is just a complicated way to say that when we train a new decision tree, +represented by :math:`\gamma_{lm} \textbf{1}(x \in R_{lm})`, we want to dampen +its effect on the previous prediction :math:`F_{m-1}(x)` with a factor +:math:`\eta`. + +Typical values for this parameter are between ``0.01`` and ``0.3``. + +XGBoost's default value is ``0.3``. + +Number of boost rounds +...................... +Lastly, we can decide on how many boosting rounds we perform, which means how +many decision trees we ultimately train. When we do heavy subsampling or use a small +learning rate, it might make sense to increase the number of boosting rounds. + +XGBoost's default value is ``10``. + +Putting it together +................... +Let's see what this looks like in code! We just need to adjust our ``config`` dict: + +.. code-block:: python + + if __name__ == "__main__": + config = { + "objective": "binary:logistic", + "max_depth": 2, + "min_child_weight": 0, + "subsample": 0.8, + "eta": 0.2 + } + accuracy = train_breast_cancer(config) + print("Accuracy: {:.2f}".format(accuracy)) + +The rest stays the same. Please note that we do not adjust the ``num_boost_round`` here. +The result should also show a high accuracy of over 90%. + +Tuning the configuration parameters +----------------------------------- +XGBoost's default parameters already lead to a good accuracy, and even our guesses in the +last section should result in accuracies well above 90%. However, our guesses were +just that: guesses. Often we do not know what combination of parameters would actually +lead to the best results on a machine learning task. + +Unfortunately, there are infinitely many combinations of hyperparameters we could try +out. Should we combine ``max_depth=3`` with ``subsample=0.8`` or with ``subsample=0.9``? +What about the other parameters? + +This is where hyperparameter tuning comes into play. 
By using tuning libraries such as +Ray Tune we can try out combinations of hyperparameters. Using sophisticated search +strategies, these parameters can be selected so that they are likely to lead to good +results (avoiding an expensive *exhaustive search*). Also, trials that do not perform +well can be preemptively stopped to reduce waste of computing resources. Lastly, Ray Tune +also takes care of training these runs in parallel, greatly increasing search speed. + +Let's start with a basic example on how to use Tune for this. We just need to make +a few changes to our code-block: + +.. code-block:: python + :emphasize-lines: 26,32,33,34,35,37,38,39,40,41 + + import numpy as np + import sklearn.datasets + import sklearn.metrics + from sklearn.model_selection import train_test_split + import xgboost as xgb + + from ray import tune + + + def train_breast_cancer(config): + # Load dataset + data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True) + # Split into train and test set + train_x, test_x, train_y, test_y = train_test_split( + data, labels, test_size=0.25) + # Build input matrices for XGBoost + train_set = xgb.DMatrix(train_x, label=train_y) + test_set = xgb.DMatrix(test_x, label=test_y) + # Train the classifier + bst = xgb.train(config, train_set, evals=[(test_set, "eval")], verbose_eval=False) + # Predict labels for the test set + preds = bst.predict(test_set) + pred_labels = np.rint(preds) + # Return prediction accuracy + accuracy = sklearn.metrics.accuracy_score(test_y, pred_labels) + tune.report(mean_accuracy=accuracy, done=True) + + + if __name__ == "__main__": + config = { + "objective": "binary:logistic", + "max_depth": tune.randint(1, 9), + "min_child_weight": tune.choice([1, 2, 3]), + "subsample": tune.uniform(0.5, 1.0), + "eta": tune.loguniform(1e-4, 1e-1) + } + tune.run( + train_breast_cancer, + resources_per_trial={"cpu": 1}, + config=config, + num_samples=10) + +As you can see, the changes in the actual training function are minimal. 
Instead of +returning the accuracy value, we report it back to Tune using ``tune.report()``. +Our ``config`` dictionary only changed slightly. Instead of passing hard-coded +parameters, we tell Tune to choose values from a range of valid options. There are +a number of options we have here, all of which are explained in +:ref:`the Tune docs `. + +For a brief explanation, this is what they do: + +* ``tune.randint(min, max)`` chooses a random integer value between *min* and *max*. + Note that *max* is exclusive, so it will not be sampled. +* ``tune.choice([a, b, c])`` chooses one of the items of the list at random. Each item + has the same chance to be sampled. +* ``tune.uniform(min, max)`` samples a floating point number between *min* and *max*. + Note that *max* is exclusive here, too. +* ``tune.loguniform(min, max, base=10)`` samples a floating point number between *min* and *max*, + but applies a logarithmic transformation to these boundaries first. Thus, this makes + it easy to sample values from different orders of magnitude. + + + +The ``num_samples=10`` option we pass to ``tune.run()`` means that we sample 10 different +hyperparameter configurations from this search space. + +The output of our training run could look like this: + +.. 
code-block:: + :emphasize-lines: 6 + + +---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+----------+--------+------------------+ + | Trial name | status | loc | eta | max_depth | min_child_weight | subsample | acc | iter | total time (s) | + |---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+----------+--------+------------------| + | train_breast_cancer_c817a_00000 | TERMINATED | | 0.00334038 | 8 | 1 | 0.640256 | 0.93007 | 1 | 0.050081 | + | train_breast_cancer_c817a_00001 | TERMINATED | | 0.00285335 | 4 | 3 | 0.951621 | 0.93007 | 1 | 0.0453899 | + | train_breast_cancer_c817a_00002 | TERMINATED | | 0.0597631 | 5 | 2 | 0.96479 | 0.986014 | 1 | 0.0503612 | + | train_breast_cancer_c817a_00003 | TERMINATED | | 0.000650095 | 6 | 2 | 0.923812 | 0.951049 | 1 | 0.0588872 | + | train_breast_cancer_c817a_00004 | TERMINATED | | 0.00753275 | 1 | 1 | 0.973499 | 0.881119 | 1 | 0.0347321 | + | train_breast_cancer_c817a_00005 | TERMINATED | | 0.000411214 | 5 | 1 | 0.672503 | 0.958042 | 1 | 0.0477931 | + | train_breast_cancer_c817a_00006 | TERMINATED | | 0.0940201 | 5 | 2 | 0.711124 | 0.972028 | 1 | 0.069901 | + | train_breast_cancer_c817a_00007 | TERMINATED | | 0.0372492 | 1 | 1 | 0.76303 | 0.895105 | 1 | 0.0496318 | + | train_breast_cancer_c817a_00008 | TERMINATED | | 0.000140322 | 1 | 2 | 0.885415 | 0.909091 | 1 | 0.045424 | + | train_breast_cancer_c817a_00009 | TERMINATED | | 0.000341654 | 5 | 3 | 0.720523 | 0.937063 | 1 | 0.0657773 | + +---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+----------+--------+------------------+ + +The best configuration we found used ``eta=0.0597631``, ``max_depth=5``, +``min_child_weight=2``, ``subsample=0.96479`` and reached an accuracy of +``0.986014``. 
+ +Early stopping +-------------- +Currently, Tune samples 10 different hyperparameter configurations and trains a full +XGBoost model on all of them. In our small example, training is very fast. However, +if training takes longer, a significant amount of computing resources is spent on trials +that will eventually perform poorly, e.g. reach a low accuracy. It would be good +if we could identify these trials early and stop them, so we don't waste any resources. + +This is where Tune's *Schedulers* shine. A Tune ``TrialScheduler`` is responsible +for starting and stopping trials. Tune implements a number of different schedulers, each +described :ref:`in the Tune documentation `. +For our example, we will use the ``AsyncHyperBandScheduler``, which is also available under the shorter name ``ASHAScheduler``. + +The basic idea of this scheduler: We sample a number of hyperparameter configurations. +Each of these configurations is trained for a specific number of iterations. +After these iterations, only the best performing hyperparameters are retained. These +are selected according to some loss metric, usually an evaluation loss. This cycle is +repeated until we end up with the best configuration. + +The ``ASHAScheduler`` needs to know three things: + +1. Which metric should be used to identify badly performing trials? +2. Should this metric be maximized or minimized? +3. How many iterations does each trial train for? + +There are more parameters, which are explained in the +:ref:`documentation `. + +Lastly, we have to report the loss metric to Tune. We do this with a ``Callback`` that +XGBoost accepts and calls after each training iteration. We also tell XGBoost which +loss metrics to calculate in the ``eval_metric`` parameter. These are the metrics +available in ``env.evaluation_result_list`` below. + +.. 
code-block:: python + :emphasize-lines: 11,12,13,26,42,44,45,46,47,48,49 + + import numpy as np + import sklearn.datasets + import sklearn.metrics + from ray.tune.schedulers import ASHAScheduler + from sklearn.model_selection import train_test_split + import xgboost as xgb + + from ray import tune + + + def XGBCallback(env): + # After every training iteration, report loss to Tune + tune.report(**dict(env.evaluation_result_list)) + + + def train_breast_cancer(config): + # Load dataset + data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True) + # Split into train and test set + train_x, test_x, train_y, test_y = train_test_split( + data, labels, test_size=0.25) + # Build input matrices for XGBoost + train_set = xgb.DMatrix(train_x, label=train_y) + test_set = xgb.DMatrix(test_x, label=test_y) + # Train the classifier + bst = xgb.train(config, train_set, evals=[(test_set, "eval")], verbose_eval=False, callbacks=[XGBCallback]) + # Predict labels for the test set + preds = bst.predict(test_set) + pred_labels = np.rint(preds) + # Return prediction accuracy + accuracy = sklearn.metrics.accuracy_score(test_y, pred_labels) + tune.report(mean_accuracy=accuracy, done=True) + + + if __name__ == "__main__": + config = { + "objective": "binary:logistic", + "max_depth": tune.randint(1, 9), + "min_child_weight": tune.choice([1, 2, 3]), + "subsample": tune.uniform(0.5, 1.0), + "eta": tune.loguniform(1e-4, 1e-1), + "eval_metric": ["auc", "ams@0", "logloss"] + } + scheduler = ASHAScheduler( + metric="eval-logloss", # The `eval` prefix is defined in xgb.train + mode="min", # Retain configurations with a low logloss + max_t=11, # 10 training iterations + 1 final evaluation + grace_period=1, # Number of minimum iterations for each trial + reduction_factor=2) # How aggressively to stop trials + tune.run( + train_breast_cancer, + resources_per_trial={"cpu": 1}, + config=config, + num_samples=10, + scheduler=scheduler) + +The output of our run could look like this: + +.. 
code-block:: + :emphasize-lines: 13 + + +---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+----------+--------+------------------+ + | Trial name | status | loc | eta | max_depth | min_child_weight | subsample | acc | iter | total time (s) | + |---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+----------+--------+------------------| + | train_breast_cancer_806ea_00000 | TERMINATED | | 0.0371055 | 2 | 1 | 0.611729 | 0.951049 | 11 | 0.339279 | + | train_breast_cancer_806ea_00001 | TERMINATED | | 0.0324613 | 3 | 2 | 0.643815 | | 4 | 0.230338 | + | train_breast_cancer_806ea_00002 | TERMINATED | | 0.0100875 | 4 | 3 | 0.985147 | | 2 | 0.0661929 | + | train_breast_cancer_806ea_00003 | TERMINATED | | 0.00124263 | 1 | 3 | 0.890299 | | 1 | 0.0201721 | + | train_breast_cancer_806ea_00004 | TERMINATED | | 0.000230373 | 5 | 3 | 0.627611 | | 1 | 0.0265107 | + | train_breast_cancer_806ea_00005 | TERMINATED | | 0.000186942 | 5 | 2 | 0.831801 | | 1 | 0.026082 | + | train_breast_cancer_806ea_00006 | TERMINATED | | 0.00871051 | 2 | 3 | 0.721523 | 0.958042 | 11 | 0.299392 | + | train_breast_cancer_806ea_00007 | TERMINATED | | 0.00440949 | 2 | 3 | 0.606252 | | 1 | 0.0210171 | + | train_breast_cancer_806ea_00008 | TERMINATED | | 0.00948289 | 5 | 2 | 0.892979 | | 2 | 0.140424 | + | train_breast_cancer_806ea_00009 | TERMINATED | | 0.0514017 | 2 | 1 | 0.859864 | 0.972028 | 11 | 0.365437 | + +---------------------------------+------------+-------+-------------+-------------+--------------------+-------------+----------+--------+------------------+ + +As you can see, four trials have been stopped after just one iteration, two after two iterations, +one after four iterations, and the three most promising configurations have been run for +ten iterations. 
The 11 is due to the fact that we finally report the accuracy after +training the full model, which is internally interpreted as another iteration. + +Using fractional GPUs +--------------------- +You can often accelerate your training by using GPUs in addition to CPUs. However, +you usually don't have as many GPUs as you have trials to run. For instance, if you +run 10 Tune trials in parallel, you usually don't have access to 10 separate GPUs. + +Tune supports *fractional GPUs*. This means that each task is assigned a fraction +of the GPU memory for training. For 10 tasks, this could look like this: + +.. code-block:: python + :emphasize-lines: 8,12 + + config = { + "objective": "binary:logistic", + "max_depth": tune.randint(1, 9), + "min_child_weight": tune.choice([1, 2, 3]), + "subsample": tune.uniform(0.5, 1.0), + "eta": tune.loguniform(1e-4, 1e-1), + "eval_metric": ["auc", "ams@0", "logloss"], + "tree_method": "gpu_hist" + } + tune.run( + train_breast_cancer, + resources_per_trial={"cpu": 1, "gpu": 0.1}, + config=config, + num_samples=10, + scheduler=scheduler) + +Each task thus works with 10% of the available GPU memory. You also have to tell +XGBoost to use the ``gpu_hist`` tree method, so it knows it should use the GPU. + +Conclusion +---------- +You should now have a basic understanding of how to train XGBoost models and of how +to tune the hyperparameters to yield the best results. In our simple example, +tuning the parameters didn't make a huge difference for the accuracy. +But in larger applications, intelligent hyperparameter tuning can make the +difference between a model that doesn't seem to learn at all, and a model +that outperforms all the other ones. 
+ +Further References +------------------ + +* `XGBoost Hyperparameter Tuning - A Visual Guide `_ +* `Notes on XGBoost Parameter Tuning `_ +* `Doing XGBoost Hyperparameter Tuning the smart way `_ diff --git a/python/ray/tune/examples/xgboost_example.py b/python/ray/tune/examples/xgboost_example.py index 1e7303fe9..10ad88e6a 100644 --- a/python/ray/tune/examples/xgboost_example.py +++ b/python/ray/tune/examples/xgboost_example.py @@ -1,49 +1,61 @@ -import xgboost as xgb import numpy as np import sklearn.datasets import sklearn.metrics +from ray.tune.schedulers import ASHAScheduler from sklearn.model_selection import train_test_split +import xgboost as xgb from ray import tune def XGBCallback(env): + # After every training iteration, report loss to Tune tune.report(**dict(env.evaluation_result_list)) def train_breast_cancer(config): - data, target = sklearn.datasets.load_breast_cancer(return_X_y=True) + # Load dataset + data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True) + # Split into train and test set train_x, test_x, train_y, test_y = train_test_split( - data, target, test_size=0.25) + data, labels, test_size=0.25) + # Build input matrices for XGBoost train_set = xgb.DMatrix(train_x, label=train_y) test_set = xgb.DMatrix(test_x, label=test_y) + # Train the classifier bst = xgb.train( - config, train_set, evals=[(test_set, "eval")], callbacks=[XGBCallback]) + config, + train_set, + evals=[(test_set, "eval")], + verbose_eval=False, + callbacks=[XGBCallback]) + # Predict labels for the test set preds = bst.predict(test_set) pred_labels = np.rint(preds) - tune.report( - mean_accuracy=sklearn.metrics.accuracy_score(test_y, pred_labels), - done=True) + # Return prediction accuracy + accuracy = sklearn.metrics.accuracy_score(test_y, pred_labels) + tune.report(mean_accuracy=accuracy, done=True) if __name__ == "__main__": - num_threads = 2 config = { - "verbosity": 0, - "num_threads": num_threads, "objective": "binary:logistic", - "booster": "gbtree", - 
"eval_metric": ["auc", "ams@0", "logloss"], "max_depth": tune.randint(1, 9), + "min_child_weight": tune.choice([1, 2, 3]), + "subsample": tune.uniform(0.5, 1.0), "eta": tune.loguniform(1e-4, 1e-1), - "gamma": tune.loguniform(1e-8, 1.0), - "grow_policy": tune.choice(["depthwise", "lossguide"]) + "eval_metric": ["auc", "ams@0", "logloss"] } - - from ray.tune.schedulers import ASHAScheduler + # The ASHAScheduler stops bad performing configurations early + scheduler = ASHAScheduler( + metric="eval-logloss", # The `eval` prefix is defined in xgb.train + mode="min", # Retain configurations with a low logloss + max_t=11, # 10 training iterations + 1 final evaluation + grace_period=1, # Number of minimum iterations for each trial + reduction_factor=2) # How aggressively to stop trials tune.run( - train_breast_cancer, - resources_per_trial={"cpu": num_threads}, + train_breast_cancer, # your training function + resources_per_trial={"cpu": 1}, # You can add "gpu": 0.1 here config=config, - num_samples=2, - scheduler=ASHAScheduler(metric="eval-logloss", mode="min")) + num_samples=10, # number of parameter configurations to try + scheduler=scheduler)