From da1501e67e8f7fc0f4ff268ef5a96433403c2fe9 Mon Sep 17 00:00:00 2001
From: cooper
Date: Fri, 17 May 2024 13:49:44 +0800
Subject: [PATCH] init repo

---
 .secret                                       |   1 +
 decspider/__init__.py                         |   0
 .../__pycache__/__init__.cpython-311.pyc      | Bin 0 -> 175 bytes
 decspider/__pycache__/items.cpython-311.pyc   | Bin 0 -> 962 bytes
 .../__pycache__/middlewares.cpython-311.pyc   | Bin 0 -> 7847 bytes
 decspider/__pycache__/myutils.cpython-311.pyc | Bin 0 -> 8866 bytes
 .../__pycache__/pipelines.cpython-311.pyc     | Bin 0 -> 4727 bytes
 .../__pycache__/settings.cpython-311.pyc      | Bin 0 -> 1171 bytes
 decspider/items.py                            |  21 +++
 decspider/middlewares.py                      | 168 ++++++++++++++++++
 decspider/myutils.py                          | 110 ++++++++++++
 decspider/pipelines.py                        |  75 ++++++++
 decspider/settings.py                         | 114 ++++++++++++
 decspider/spiders/__init__.py                 |   4 +
 .../__pycache__/__init__.cpython-311.pyc      | Bin 0 -> 183 bytes
 .../__pycache__/corpnews.cpython-311.pyc      | Bin 0 -> 7189 bytes
 decspider/spiders/corpnews.py                 |  95 ++++++++++
 scrapy.cfg                                    |  12 ++
 18 files changed, 600 insertions(+)
 create mode 100644 .secret
 create mode 100644 decspider/__init__.py
 create mode 100644 decspider/__pycache__/__init__.cpython-311.pyc
 create mode 100644 decspider/__pycache__/items.cpython-311.pyc
 create mode 100644 decspider/__pycache__/middlewares.cpython-311.pyc
 create mode 100644 decspider/__pycache__/myutils.cpython-311.pyc
 create mode 100644 decspider/__pycache__/pipelines.cpython-311.pyc
 create mode 100644 decspider/__pycache__/settings.cpython-311.pyc
 create mode 100644 decspider/items.py
 create mode 100644 decspider/middlewares.py
 create mode 100644 decspider/myutils.py
 create mode 100644 decspider/pipelines.py
 create mode 100644 decspider/settings.py
 create mode 100644 decspider/spiders/__init__.py
 create mode 100644 decspider/spiders/__pycache__/__init__.cpython-311.pyc
 create mode 100644 decspider/spiders/__pycache__/corpnews.cpython-311.pyc
 create mode 100644 decspider/spiders/corpnews.py
 create mode 100644 scrapy.cfg

diff --git a/.secret b/.secret
new file mode 100644
index 0000000..46dec27
--- /dev/null
+++ b/.secret
@@ -0,0 +1 @@
+o36r0lgw71mdzm9rkwrv3wi1wn|3600|1715909100.492765|oadugxyl9fhoqsamqopc
\ No newline at end of file
diff --git a/decspider/__init__.py b/decspider/__init__.py
new file mode 100644
index 0000000..e69de29
[Six "GIT binary patch" hunks omitted here: the decspider/__pycache__/ files __init__, items, middlewares, myutils, pipelines and settings .cpython-311.pyc are compiled bytecode (sizes listed in the file summary above) and are not human-readable.]
diff --git a/decspider/items.py b/decspider/items.py
new file mode 100644
index 0000000..5be5d76
--- /dev/null
+++ b/decspider/items.py
@@ -0,0 +1,21 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+import scrapy.resolver
+
+
+class NewsItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    title = scrapy.Field()
+    date = scrapy.Field()
+    source = scrapy.Field()
+    content = scrapy.Field()
+    image_urls = scrapy.Field()
+    classify = scrapy.Field()
+    collection = scrapy.Field()
+    url = scrapy.Field()
+    source_url = scrapy.Field()
diff --git a/decspider/middlewares.py b/decspider/middlewares.py
new file mode 100644
index 0000000..828e145
--- /dev/null
+++ b/decspider/middlewares.py
@@ -0,0 +1,168 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+# useful for handling different item types with a single interface
+from itemadapter import is_item, ItemAdapter
+from .myutils import ProxyPool
+from .settings import USERNAME, PASSWORD
+from faker import Faker
+
+
+class DecspiderSpiderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+ + # Must return an iterable of Request, or item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request or item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info("Spider opened: %s" % spider.name) + + +class DecspiderDownloaderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. 
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info("Spider opened: %s" % spider.name)
+
+
+class ProxyMiddleware:
+    def __init__(self):
+        # Initialise the proxy pool and a per-proxy failure counter
+        self.proxy_pool = ProxyPool()
+        self.proxy_failures = {proxy: 0 for proxy in self.proxy_pool.proxy_list}
+        self.fake = Faker()
+
+
+    def process_request(self, request, spider):
+        # Pick a random proxy and a random User-Agent for every request
+        proxy = self.proxy_pool.get_one()
+        if proxy not in self.proxy_failures:
+            self.proxy_failures[proxy] = 0
+        request.meta['proxy'] = "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": USERNAME, "pwd": PASSWORD, "proxy": proxy}
+        ua = self.fake.user_agent()
+        request.headers['User-Agent'] = ua
+        spider.logger.info(f'Using proxy: {proxy}\nUsing UA: {ua}')
+
+
+    def process_response(self, request, response, spider):
+        # Pass healthy responses through unchanged
+        if response.status in [200, 301, 302]:
+            return response
+        # Otherwise count the failure against this proxy
+        else:
+            self._handle_proxy_failure(request.meta['proxy'], spider)
+            # and reschedule the request
+            return request
+
+
+    def process_exception(self, request, exception, spider):
+        # A download error also counts as a failure for the proxy in use
+        self._handle_proxy_failure(request.meta['proxy'], spider)
+        # Reschedule the request
+        return request
+
+
+    def _handle_proxy_failure(self, http_proxy, spider):
+        # Increment the failure count for the given proxy
+        proxy = http_proxy.split('@')[-1][:-1]
+        self.proxy_failures[proxy] += 1
+        spider.logger.error(f'Proxy {proxy} failed, failure count: {self.proxy_failures[proxy]}')
+
+        # Drop a proxy from the pool once it has failed twice
+        if self.proxy_failures[proxy] >= 2:
+            self.proxy_pool.remove(proxy)
+            del self.proxy_failures[proxy]
+            spider.logger.error(f'Removed proxy {proxy} after consecutive failures.')
+
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
\ No newline at end of file
diff --git a/decspider/myutils.py b/decspider/myutils.py
new file mode 100644
index 0000000..d0b7d2d
--- /dev/null
+++ b/decspider/myutils.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+import time, random, os, sys, json
+import requests, datetime
+from .settings import SECRETID, SECRETKEY, PROXYPOOL_UPDATENUM, PROXYPOOL_MIN_NUM, PROXYPOOL_MIN_DURATION
+
+
+SECRET_PATH = './.secret'
+
+
+def _get_secret_token():
+    r = requests.post(url='https://auth.kdlapi.com/api/get_secret_token', data={'secret_id': SECRETID, 'secret_key': SECRETKEY})
+    if r.status_code != 200:
+        raise KdlException(r.status_code, r.content.decode('utf8'))
+    res = json.loads(r.content.decode('utf8'))
+    code, msg = res['code'], res['msg']
+    if code != 0:
+        raise KdlException(code, msg)
+    secret_token = res['data']['secret_token']
+    expire = str(res['data']['expire'])
+    _time = '%.6f' % time.time()
+    return secret_token, expire, _time
+
+
+def _read_secret_token():
+    with open(SECRET_PATH, 'r') as f:
+        token_info = f.read()
+    secret_token, expire, _time, last_secret_id = token_info.split('|')
+    if float(_time) + float(expire) - 3 * 60 < time.time() or SECRETID != last_secret_id:  # refresh when the token is within 3 minutes of expiry or the SecretId has changed
+        secret_token, expire, _time = _get_secret_token()
+        with open(SECRET_PATH, 'w') as f:
+            f.write(secret_token + '|' + expire + '|' + _time + '|' + SECRETID)
+    return secret_token
+
+
+def get_secret_token():
+    if os.path.exists(SECRET_PATH):
+        secret_token = _read_secret_token()
+    else:
+        secret_token, expire, _time = _get_secret_token()
+        with open(SECRET_PATH, 'w') as f:
+            f.write(secret_token + '|' + expire + '|' + _time + '|' + SECRETID)
+    return secret_token
+
+
+class KdlException(Exception):
+    """Exception class for errors returned by the KDL proxy API."""
+
+
+    def __init__(self, code=None, message=None):
+        self.code = code
+        if sys.version_info[0] < 3 and isinstance(message, unicode):
+            message = message.encode("utf8")
+        self.message = message
+        self._hint_message = "[KdlException] code: {} message: {}".format(self.code, self.message)
+
+
+    @property
+    def hint_message(self):
+        return self._hint_message
+
+
+    @hint_message.setter
+    def hint_message(self, value):
+        self._hint_message = value
+
+
+    def __str__(self):
+        if sys.version_info[0] < 3 and isinstance(self.hint_message, unicode):
+            self.hint_message = self.hint_message.encode("utf8")
+        return self.hint_message
+
+
+class ProxyPool:
+
+    def __init__(self):
+        self.update_num = PROXYPOOL_UPDATENUM
+        self.min_num = PROXYPOOL_MIN_NUM
+        self.min_duration = PROXYPOOL_MIN_DURATION
+        self.signature = get_secret_token()
+        self.api_url = f'https://dps.kdlapi.com/api/getdps/?secret_id={SECRETID}&signature={self.signature}&num={self.update_num}&pt=1&format=json&sep=1'
+        self.proxy_list = []
+
+
+    def get_one(self):
+        self.ensure_min_num()
+        _proxy_list = []
+        while not _proxy_list:
+            last_got = datetime.datetime.now() - datetime.timedelta(seconds=self.min_duration)
+            _proxy_list = [p for p in self.proxy_list if p['last_got_time'] < last_got]
+        _proxy = random.choice(_proxy_list)
+        _proxy['last_got_time'] = datetime.datetime.now()
+        return _proxy['proxy']
+
+
+    def remove(self, proxy:str):
+        self.proxy_list = [p for p in self.proxy_list if p['proxy'] != proxy]
+        self.ensure_min_num()
+
+
+    def ensure_min_num(self):
+        while len(self.proxy_list) < self.min_num:
+            new_proxy_list = requests.get(self.api_url).json().get('data').get('proxy_list')
+            _proxy_list = [{'proxy': p, 'last_got_time': datetime.datetime(2020, 10, 1, 12, 30, 30, 100000)} for p in new_proxy_list]
+            self.proxy_list.extend(_proxy_list)
+
+
+if __name__ == '__main__':
+    proxypool = ProxyPool()
+    print(proxypool.get_one())
\ No newline at end of file
diff --git a/decspider/pipelines.py b/decspider/pipelines.py
new file mode 100644
index 0000000..9a523e1
--- /dev/null
+++ b/decspider/pipelines.py
@@ -0,0 +1,75 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+# useful for handling different item types with a single interface
+import mysql.connector
+from scrapy.exceptions import DropItem
+from .items import NewsItem
+from .settings import MYSQL_USERNAME, MYSQL_PASSWORD, MYSQL_HOST, MYSQL_PORT, MYSQL_DATABASE
+
+
+class DecspiderPipeline:
+    def open_spider(self, spider):
+        # Connect to the database
+        self.conn = mysql.connector.connect(user=MYSQL_USERNAME, password=MYSQL_PASSWORD, host=MYSQL_HOST, database=MYSQL_DATABASE, port=MYSQL_PORT)
+        self.cursor = self.conn.cursor()
+
+        # Build the table name from the bot name and the spider name
+        self.table_name = f'{spider.settings.get("BOT_NAME")}_{spider.name}'
+        spider.log(f'Dataset name: {self.table_name}')
+
+        # Create the table if it does not exist yet
+        self.cursor.execute(f"""
+            CREATE TABLE IF NOT EXISTS `{self.table_name}` (
+                id INT AUTO_INCREMENT PRIMARY KEY
+            )
+        """)
+
+        # Read the table's current columns
+        self.cursor.execute(f"SHOW COLUMNS FROM `{self.table_name}`")
+        existing_columns = {row[0] for row in self.cursor.fetchall()}
+
+        # Collect the field names defined on NewsItem
+        item_columns = set(NewsItem.fields.keys())
+
+        # Add NewsItem fields that are missing from the table
+        for column in item_columns:
+            if column not in existing_columns:
+                self.cursor.execute(f"ALTER TABLE `{self.table_name}` ADD COLUMN `{column}` TEXT")
+                spider.log(f'Added column `{column}` to `{self.table_name}` table')
+
+        # Drop columns that are no longer defined on NewsItem
+        for column in existing_columns:
+            if column not in item_columns and column != 'id':
+                self.cursor.execute(f"ALTER TABLE `{self.table_name}` DROP COLUMN `{column}`")
+                spider.log(f'Dropped column `{column}` from `{self.table_name}` table')
+
+        self.conn.commit()
+
+
+    def close_spider(self, spider):
+        self.conn.close()
+
+
+    def process_item(self, item, spider):
+
+        if isinstance(item, NewsItem):
+            # Insert the item as one row
+            columns = ', '.join(item.keys())
+            placeholders = ', '.join(['%s'] * len(item))
+            sql = f"INSERT INTO `{self.table_name}` ({columns}) VALUES ({placeholders})"
+
+            try:
+                self.cursor.execute(sql, list(item.values()))
+                self.conn.commit()
+            except mysql.connector.Error as e:
+                spider.log(f"Error when inserting item: {e}")
+                self.conn.rollback()
+                raise DropItem(f"Error when inserting item: {e}")
+
+        return item
+
+
diff --git a/decspider/settings.py b/decspider/settings.py
new file mode 100644
index 0000000..24c0aef
--- /dev/null
+++ b/decspider/settings.py
@@ -0,0 +1,114 @@
+# Scrapy settings for decspider project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used.
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = "decspider" + +SPIDER_MODULES = ["decspider.spiders"] +NEWSPIDER_MODULE = "decspider.spiders" + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = "decspider (+http://www.yourdomain.com)" + +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", +# "Accept-Language": "en", +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# "decspider.middlewares.DecspiderSpiderMiddleware": 543, +#} + +DOWNLOADER_MIDDLEWARES = { + "decspider.middlewares.ProxyMiddleware": 543, +} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# "scrapy.extensions.telnet.TelnetConsole": None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'decspider.pipelines.DecspiderPipeline': 300, + 'crawlab.CrawlabPipeline': 888, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = "httpcache" +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" + +# Set settings whose default value is deprecated to a future-proof value +REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7" +TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" +FEED_EXPORT_ENCODING = "utf-8" + + +# Proxy setting +SECRETID = "oadugxyl9fhoqsamqopc" +SECRETKEY = "j2gugufp2batb8y2olw9la1cptxfapko" +USERNAME = "d2667352953" +PASSWORD = "m93ih6hh" +PROXYPOOL_UPDATENUM = 1 +PROXYPOOL_MIN_NUM = 3 +PROXYPOOL_MIN_DURATION = 1 + + +# MySQL Configuration +MYSQL_USERNAME = "root" +MYSQL_PASSWORD = "yGWptA_tX4bZ2q" +MYSQL_HOST = "10.18.30.148" +MYSQL_PORT = 3307 +MYSQL_DATABASE = "crawler_data" + + +# Test 
+CLOSESPIDER_PAGECOUNT = 5
diff --git a/decspider/spiders/__init__.py b/decspider/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/decspider/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
[Two "GIT binary patch" hunks omitted here: decspider/spiders/__pycache__/__init__.cpython-311.pyc (183 bytes) and corpnews.cpython-311.pyc (7189 bytes) are compiled bytecode and are not human-readable.]
diff --git a/decspider/spiders/corpnews.py b/decspider/spiders/corpnews.py
new file mode 100644
index 0000000..ee68a94
--- /dev/null
+++ b/decspider/spiders/corpnews.py
@@ -0,0 +1,95 @@
+import scrapy
+import mysql.connector
+from mysql.connector import errorcode
+from urllib.parse import urljoin
+from ..items import NewsItem
+from ..settings import MYSQL_USERNAME, MYSQL_PASSWORD, MYSQL_HOST, MYSQL_PORT, MYSQL_DATABASE
+
+
+class CorpnewsSpider(scrapy.Spider):
+    name = "corpnews"
+    allowed_domains = ["dongfang.com"]
+
+
+    def __init__(self, *args, **kwargs):
+        super(CorpnewsSpider, self).__init__(*args, **kwargs)
+        self.crawled_urls = set()
+
+
+    def start_requests(self):
+        # Connect to the database
+        self.conn = mysql.connector.connect(user=MYSQL_USERNAME, password=MYSQL_PASSWORD, host=MYSQL_HOST, database=MYSQL_DATABASE, port=MYSQL_PORT)
+        self.cursor = self.conn.cursor()
+
+        # Build the table name from the bot name and the spider name
+        dataset_name = f'{self.settings.get("BOT_NAME")}_{self.name}'
+
+        # Load the URLs that have already been crawled
+        try:
+            self.cursor.execute(f"SELECT url FROM `{dataset_name}`")
+            self.crawled_urls = {row[0] for row in self.cursor.fetchall()}
+        except mysql.connector.Error as err:
+            if err.errno == errorcode.ER_NO_SUCH_TABLE:
+                self.log(f"Table `{dataset_name}` does not exist. Initializing crawled URLs as an empty set.")
+                self.crawled_urls = set()
+            else:
+                self.log(f"Error fetching URLs from `{dataset_name}`: {err}")
+                self.crawled_urls = set()
+
+        # Close the database connection
+        self.conn.close()
+
+        # Kick off the initial requests
+        start_urls = ["https://www.dongfang.com/xwzx/jtyw1/qb.htm", "https://www.dongfang.com/xwzx/jcdt.htm", 'https://www.dongfang.com/xwzx/mtzs.htm']
+        for url in start_urls:
+            yield scrapy.Request(url, self.parse)
+
+
+    def parse(self, response):
+        first_news = response.xpath('//div[@class="news_top"]/div[@class="news_img"]/a')
+        self.log(f'crawled_urls: {self.crawled_urls}')
+        if first_news:
+            first_news_url = first_news.attrib['href']
+            full_url = urljoin(response.url, first_news_url)
+            if full_url not in self.crawled_urls:
+                yield scrapy.Request(full_url, self.news_parse)
+
+        news_list = response.xpath('//div[contains(@class,"swiper-slide")]/dl/dd/a')
+        for news in news_list:
+            news_url = news.attrib['href']
+            full_url = urljoin(response.url, news_url)
+            if full_url not in self.crawled_urls:
+                self.log(f'full_url: {full_url}')
+                yield scrapy.Request(full_url, self.news_parse)
+
+        next_page = response.xpath('//span[contains(@class, "p_next")]/a')
+        if next_page:
+            next_page_url = next_page.attrib['href']
+            yield response.follow(next_page_url, self.parse)
+
+
+    def news_parse(self, response):
+        news_item = NewsItem()
+        news_item['title'] = response.xpath('//div[@class="xq_nr_hd"]/h5/text()').get()
+        news_item['collection'] = response.xpath('//div[@class="nysubsc"]/ul/li[@class="on"]/a/text()').get()
+        news_item['url'] = response.url
+
+        news_info = response.xpath('//div[@class="xq_nr_hd"]/span/text()')
+        news_item['date'] = news_info.re(r'时间:(\d{4}-\d{2}-\d{2}) ')[0]  # '时间' is the literal date label on the page
+
+        source_label = news_info.re(r'来源: (.*) ')  # '来源' is the literal source label on the page
+        if source_label:
+            news_item['source'] = source_label[0]
+            news_item['source_url'] = ''
+        else:
+            news_item['source'] = response.xpath('//div[@class="xq_nr_hd"]/span/a/text()').get()
+            news_item['source_url'] = response.xpath('//div[@class="xq_nr_hd"]/span/a').attrib['href']
+
+        news_text_list = response.xpath('//div[@class="v_news_content"]/p/text()')
+        news_item['content'] = '\n'.join([t.get() for t in news_text_list])
+
+        news_image_urls = response.xpath('//div[@class="v_news_content"]/p/img')
+        news_item['image_urls'] = ';\n'.join([i.attrib['src'] for i in news_image_urls])
+
+        yield news_item
+
diff --git a/scrapy.cfg b/scrapy.cfg
new file mode 100644
index 0000000..33de24a
--- /dev/null
+++ b/scrapy.cfg
@@ -0,0 +1,12 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = decspider.settings
+shell = ipython
+
+[deploy]
+#url = http://localhost:6800/
+project = decspider
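
Usage note (not part of the patch): once the patch is applied, the crawl is normally started with `scrapy crawl corpnews` from the directory containing scrapy.cfg. The sketch below shows the equivalent in-process invocation using Scrapy's public CrawlerProcess API; the file name run_corpnews.py is hypothetical, and the sketch assumes the kdlapi.com proxy credentials and the MySQL settings in decspider/settings.py are valid and reachable.

# run_corpnews.py -- hypothetical helper script, placed next to scrapy.cfg
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def main():
    # Resolve decspider/settings.py via scrapy.cfg (proxy and MySQL credentials live there).
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    # "corpnews" is the spider name declared in decspider/spiders/corpnews.py.
    process.crawl("corpnews")
    # Blocks until the crawl finishes; CLOSESPIDER_PAGECOUNT = 5 keeps test runs short.
    process.start()


if __name__ == "__main__":
    main()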