
    s<gN              
          d dl mZ d dlmZ d dlmZmZ ddlmZm	Z	m
Z
 ddlmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ  G d d      Z G d	 d
e      Z G d de      Z G d de      Z  G d de      Z! G d de      Z" G d de      Z# G d de      Z$ G d de      Z% G d de      Z& ed      dee'   dee'   de(fd        Z) ed!      	 d'd"e'd#e*d$e(de*fd%       Z+y&)(    )	lru_cache)	getLogger)ListOptional   )COMMON_SAFE_ASCII_CHARACTERSTRACEUNICODE_SECONDARY_RANGE_KEYWORD)is_accentuated	is_arabicis_arabic_isolated_formis_case_variableis_cjkis_emoticon	is_hangulis_hiraganais_katakanais_latinis_punctuationis_separator	is_symbolis_thaiis_unprintableremove_accentunicode_rangec                   N    e Zd ZdZdedefdZdeddfdZd	dZe	de
fd       Zy)
MessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    	characterreturnc                     t         )z@
        Determine if given character should be fed in.
        NotImplementedErrorselfr   s     S/home/kevstigneev/proxy/myenv/lib/python3.12/site-packages/charset_normalizer/md.pyeligiblezMessDetectorPlugin.eligible%   
     "!    Nc                     t         )z
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        r!   r#   s     r%   feedzMessDetectorPlugin.feed+   s
    
 "!r(   c                     t         )zB
        Permit to reset the plugin to the initial state.
        r!   r$   s    r%   resetzMessDetectorPlugin.reset2   r'   r(   c                     t         )z
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        r!   r,   s    r%   ratiozMessDetectorPlugin.ratio8   s
     "!r(   r   N)__name__
__module____qualname____doc__strboolr&   r*   r-   propertyfloatr/    r(   r%   r   r      sM    
"# "$ ""c "d "" "u " "r(   r   c                   R    e Zd Zd	dZdedefdZdeddfdZd	dZe	de
fd       Zy)
 TooManySymbolOrPunctuationPluginr   Nc                 J    d| _         d| _        d| _        d | _        d| _        y )Nr   F)_punctuation_count_symbol_count_character_count_last_printable_char_frenzy_symbol_in_wordr,   s    r%   __init__z)TooManySymbolOrPunctuationPlugin.__init__B   s*    '("#%&37!,1#r(   r   c                 "    |j                         S Nisprintabler#   s     r%   r&   z)TooManySymbolOrPunctuationPlugin.eligibleJ       $$&&r(   c                 8   | xj                   dz  c_         || j                  k7  ro|t        vrgt        |      r| xj                  dz  c_        || _        y |j                         du r-t        |      r"t        |      du r| xj                  dz  c_        || _        y )Nr   F   )	r?   r@   r   r   r=   isdigitr   r   r>   r#   s     r%   r*   z%TooManySymbolOrPunctuationPlugin.feedM   s    " 222!==i(''1,' %.! !!#u,i(	*e3""a'"$-!r(   c                 .    d| _         d| _        d| _        y Nr   )r=   r?   r>   r,   s    r%   r-   z&TooManySymbolOrPunctuationPlugin.reset_   s    "# !r(   c                     | j                   dk(  ry| j                  | j                  z   | j                   z  }|dk\  r|S dS )Nr           333333?)r?   r=   r>   )r$   ratio_of_punctuations     r%   r/   z&TooManySymbolOrPunctuationPlugin.ratiod   sO      A% ##d&8&88!!'" (<s'B#KKr(   r0   r1   r2   r3   rB   r5   r6   r&   r*   r-   r7   r8   r/   r9   r(   r%   r;   r;   A   sP    2'# '$ '.c .d .$
 Lu L Lr(   r;   c                   R    e Zd Zd	dZdedefdZdeddfdZd	dZe	de
fd       Zy)
TooManyAccentuatedPluginr   Nc                      d| _         d| _        y rL   r?   _accentuated_countr,   s    r%   rB   z!TooManyAccentuatedPlugin.__init__q   s    %&'(r(   r   c                 "    |j                         S rD   )isalphar#   s     r%   r&   z!TooManyAccentuatedPlugin.eligibleu   s      ""r(   c                 p    | xj                   dz  c_         t        |      r| xj                  dz  c_        y y Nr   )r?   r   rV   r#   s     r%   r*   zTooManyAccentuatedPlugin.feedx   s1    ")$##q(# %r(   c                      d| _         d| _        y rL   rU   r,   s    r%   r-   zTooManyAccentuatedPlugin.reset~   s     !"#r(   c                 f    | j                   dk  ry| j                  | j                   z  }|dk\  r|S dS )N   rN   gffffff?rU   )r$   ratio_of_accentuations     r%   r/   zTooManyAccentuatedPlugin.ratio   s=      1$'+'>'>AVAV'V(=(E$N3Nr(   r0   rQ   r9   r(   r%   rS   rS   p   sP    )## #$ #)c )d )$ Ou O Or(   rS   c                   R    e Zd Zd	dZdedefdZdeddfdZd	dZe	de
fd       Zy)
UnprintablePluginr   Nc                      d| _         d| _        y rL   )_unprintable_countr?   r,   s    r%   rB   zUnprintablePlugin.__init__   s    '(%&r(   r   c                      yNTr9   r#   s     r%   r&   zUnprintablePlugin.eligible       r(   c                 n    t        |      r| xj                  dz  c_        | xj                  dz  c_        y rZ   )r   rb   r?   r#   s     r%   r*   zUnprintablePlugin.feed   s,    )$##q(#"r(   c                     d| _         y rL   )rb   r,   s    r%   r-   zUnprintablePlugin.reset   s
    "#r(   c                 Z    | j                   dk(  ry| j                  dz  | j                   z  S )Nr   rN   r]   )r?   rb   r,   s    r%   r/   zUnprintablePlugin.ratio   s/      A%''!+t/D/DDDr(   r0   rQ   r9   r(   r%   r`   r`      sP    '# $ #c #d #
$ Eu E Er(   r`   c                   R    e Zd Zd	dZdedefdZdeddfdZd	dZe	de
fd       Zy)
SuspiciousDuplicateAccentPluginr   Nc                 .    d| _         d| _        d | _        y rL   _successive_countr?   _last_latin_characterr,   s    r%   rB   z(SuspiciousDuplicateAccentPlugin.__init__   s    &'%&48"r(   r   c                 <    |j                         xr t        |      S rD   )rX   r   r#   s     r%   r&   z(SuspiciousDuplicateAccentPlugin.eligible   s      ":x	'::r(   c                 ~   | xj                   dz  c_         | j                  t        |      rt        | j                        ru|j                         r/| j                  j                         r| xj                  dz  c_        t        |      t        | j                        k(  r| xj                  dz  c_        || _        y rZ   )r?   rn   r   isupperrm   r   r#   s     r%   r*   z$SuspiciousDuplicateAccentPlugin.feed   s    "&&2y)t99:  "t'A'A'I'I'K&&!+&Y'=9S9S+TT&&!+&%."r(   c                 .    d| _         d| _        d | _        y rL   rl   r,   s    r%   r-   z%SuspiciousDuplicateAccentPlugin.reset   s    !" !%)"r(   c                 Z    | j                   dk(  ry| j                  dz  | j                   z  S )Nr   rN   rI   )r?   rm   r,   s    r%   r/   z%SuspiciousDuplicateAccentPlugin.ratio   s/      A%&&*d.C.CCCr(   r0   rQ   r9   r(   r%   rj   rj      sP    9;# ;$ ;/c /d /*
 Du D Dr(   rj   c                   R    e Zd Zd	dZdedefdZdeddfdZd	dZe	de
fd       Zy)
SuspiciousRanger   Nc                 .    d| _         d| _        d | _        y rL   )"_suspicious_successive_range_countr?   _last_printable_seenr,   s    r%   rB   zSuspiciousRange.__init__   s    78/%&37!r(   r   c                 "    |j                         S rD   rE   r#   s     r%   r&   zSuspiciousRange.eligible   rG   r(   c                 <   | xj                   dz  c_         |j                         st        |      s|t        v rd | _        y | j                  || _        y t        | j                        }t        |      }t        ||      r| xj                  dz  c_        || _        y rZ   )r?   isspacer   r   rx   r    is_suspiciously_successive_rangerw   )r$   r   unicode_range_aunicode_range_bs       r%   r*   zSuspiciousRange.feed   s    " i(88(,D%$$,(1D%)6t7P7P)Q)6y)A+O_M33q83$-!r(   c                 .    d| _         d| _        d | _        y rL   )r?   rw   rx   r,   s    r%   r-   zSuspiciousRange.reset   s     !23/$(!r(   c                 ^    | j                   dk  ry| j                  dz  | j                   z  }|S )N   rN   rI   )r?   rw   )r$   ratio_of_suspicious_range_usages     r%   r/   zSuspiciousRange.ratio   s<      B& 33a7!!2"' /.r(   r0   rQ   r9   r(   r%   ru   ru      sM    8
'# '$ '.c .d ..)
 /u / /r(   ru   c                   R    e Zd Zd	dZdedefdZdeddfdZd	dZe	de
fd       Zy)
SuperWeirdWordPluginr   Nc                     d| _         d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _	        y )Nr   F )
_word_count_bad_word_count_foreign_long_count_is_current_word_bad_foreign_long_watchr?   _bad_character_count_buffer_buffer_accent_count_buffer_glyph_countr,   s    r%   rB   zSuperWeirdWordPlugin.__init__   sQ     !$%() */!). %&)*!)*!() r(   r   c                      yrd   r9   r#   s     r%   r&   zSuperWeirdWordPlugin.eligible	  re   r(   c                    |j                         r| xj                  |z  c_        t        |      r| xj                  dz  c_        | j                  du r`t        |      du st        |      rHt        |      du r;t        |      du r.t        |      du r!t        |      du rt        |      du rd| _        t        |      s,t        |      s!t        |      st        |      st        |      r| xj                  dz  c_        y | j                  sy |j                         st        |      st        |      r| j                  r| xj                  dz  c_        t!        | j                        }| xj"                  |z  c_        |dk\  r| j                  |z  dk\  rd| _        nt        | j                  d         rX| j                  d   j'                         r;t)        d | j                  D              du r| xj*                  dz  c_        d| _        n+| j                  dk(  rd| _        | xj*                  dz  c_        |dk\  r| j                  rwt-        | j                  t/        d	|            D cg c]  \  }}|j'                         r| }}}d}|rt!        |      |z  d
k  rd}|s| xj*                  dz  c_        d| _        | j$                  rD| xj0                  dz  c_        | xj2                  t!        | j                        z  c_        d| _        d| _        d| _        d	| _        d	| _        y |dvr<|j5                         du r)t7        |      rd| _        | xj                  |z  c_        y y y y c c}}w )Nr   FT   g      ?c              3   <   K   | ]  }|j                           y wrD   )rq   ).0_s     r%   	<genexpr>z,SuperWeirdWordPlugin.feed.<locals>.<genexpr>6  s     >AAIIK>s      r   rO   r   >   r   -<=>|~)rX   r   r   r   r   r   r   r   r   r   r   r   r{   r   r   r   lenr?   r   rq   allr   zipranger   r   rJ   r   )r$   r   buffer_lengthcicamel_case_dstprobable_camel_caseds          r%   r*   zSuperWeirdWordPlugin.feed  s   LLI%Li())Q.)((E1i(E1^I5N9%.i(E1	*e3	*e3I&%/+/(y!Y'y)y)9%((A-(||>)#<Y@Wll!!$T\\!2M!!]2!!,,}<C04D- #4<<#34R(002>>>%G,,1,04D---204D-,,1,"t'?'? !$DLL%=2I J"1yy{ " "
 .3$!s>':]'Jc'Q+/(+,,1,04D-(($$)$))S->>),1)',D$DL()D%'(D$@@!!#u,)$(,D%LLI%L % - A1"s   /M1c                 t    d| _         d| _        d| _        d| _        d| _        d| _        d| _        d| _        y )Nr   Fr   )r   r   r   r   r   r?   r   r   r,   s    r%   r-   zSuperWeirdWordPlugin.reset]  sA    $)!#(   !$%!#$ r(   c                 r    | j                   dk  r| j                  dk(  ry| j                  | j                  z  S )N
   r   rN   )r   r   r   r?   r,   s    r%   r/   zSuperWeirdWordPlugin.ratiog  s7    r!d&>&>!&C((4+@+@@@r(   r0   rQ   r9   r(   r%   r   r      sT    *# $ O&c O&d O&b% Au A Ar(   r   c                   V    e Zd ZdZd
dZdedefdZdeddfdZd
dZ	e
defd	       Zy)CjkInvalidStopPluginu   
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
    can be easily detected. Searching for the overuse of '丅' and '丄'.
    r   Nc                      d| _         d| _        y rL   _wrong_stop_count_cjk_character_countr,   s    r%   rB   zCjkInvalidStopPlugin.__init__u  s    &')*!r(   r   c                      yrd   r9   r#   s     r%   r&   zCjkInvalidStopPlugin.eligibley  re   r(   c                 z    |dv r| xj                   dz  c_         y t        |      r| xj                  dz  c_        y y )N>      丄   丅r   )r   r   r   r#   s     r%   r*   zCjkInvalidStopPlugin.feed|  s<    &""a'")%%*% r(   c                      d| _         d| _        y rL   r   r,   s    r%   r-   zCjkInvalidStopPlugin.reset  s    !"$%!r(   c                 T    | j                   dk  ry| j                  | j                   z  S )N   rN   )r   r   r,   s    r%   r/   zCjkInvalidStopPlugin.ratio  s*    $$r)%%(A(AAAr(   r0   )r1   r2   r3   r4   rB   r5   r6   r&   r*   r-   r7   r8   r/   r9   r(   r%   r   r   o  sU    
+# $ +c +d +& Bu B Br(   r   c                   R    e Zd Zd	dZdedefdZdeddfdZd	dZe	de
fd       Zy)
ArchaicUpperLowerPluginr   Nc                 f    d| _         d| _        d| _        d| _        d| _        d | _        d| _        y )NFr   T)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalr?   _last_alpha_seen_current_ascii_onlyr,   s    r%   rB   z ArchaicUpperLowerPlugin.__init__  s9    	45,23*890%&/3)- r(   r   c                      yrd   r9   r#   s     r%   r&   z ArchaicUpperLowerPlugin.eligible  re   r(   c                 Z   |j                         xr t        |      }|du }|r| j                  dkD  r| j                  dk  r?|j                         du r-| j                  du r| xj
                  | j                  z  c_        d| _        d| _        d | _        d| _        | xj                  dz  c_	        d| _        y | j                  du r|j                         du rd| _        | j                  |j                         r| j                  j                         s*|j                         rM| j                  j                         r3| j                  du r| xj                  dz  c_        d| _        nd| _        nd| _        | xj                  dz  c_	        | xj                  dz  c_        || _        y )NFr   @   r   TrI   )rX   r   r   rJ   r   r   r   r   r   r?   isasciirq   islower)r$   r   is_concerned	chunk_seps       r%   r*   zArchaicUpperLowerPlugin.feed  s    ((*J/?	/J E)	==A44:%%'50,,588668 23D.34D0$(D!DI!!Q&!'+D$##t+	0A0A0Cu0L',D$  ,!!#(=(=(E(E(G!!#(=(=(E(E(G99$66!;6 %DI $DI!	",,1, )r(   c                 f    d| _         d| _        d| _        d| _        d | _        d| _        d| _        y )Nr   FT)r?   r   r   r   r   r   r   r,   s    r%   r-   zArchaicUpperLowerPlugin.reset  s9     !/0,-.*340 $	#' r(   c                 T    | j                   dk(  ry| j                  | j                   z  S )Nr   rN   )r?   r   r,   s    r%   r/   zArchaicUpperLowerPlugin.ratio  s*      A%77$:O:OOOr(   r0   rQ   r9   r(   r%   r   r     sQ    .# $ (*c (*d (*T( Pu P Pr(   r   c                   R    e Zd Zd	dZd	dZdedefdZdeddfdZe	de
fd       Zy)
ArabicIsolatedFormPluginr   Nc                      d| _         d| _        y rL   r?   _isolated_form_countr,   s    r%   rB   z!ArabicIsolatedFormPlugin.__init__  s    %&)*!r(   c                      d| _         d| _        y rL   r   r,   s    r%   r-   zArabicIsolatedFormPlugin.reset  s     !$%!r(   r   c                     t        |      S rD   )r   r#   s     r%   r&   z!ArabicIsolatedFormPlugin.eligible  s    ##r(   c                 p    | xj                   dz  c_         t        |      r| xj                  dz  c_        y y rZ   )r?   r   r   r#   s     r%   r*   zArabicIsolatedFormPlugin.feed  s1    ""9-%%*% .r(   c                 X    | j                   dk  ry| j                  | j                   z  }|S )Nr]   rN   r   )r$   isolated_form_usages     r%   r/   zArabicIsolatedFormPlugin.ratio  s0      1$%)%>%>AVAV%V""r(   r0   )r1   r2   r3   rB   r-   r5   r6   r&   r*   r7   r8   r/   r9   r(   r%   r   r     sM    +&$# $$ $+c +d + #u # #r(   r      )maxsizer}   r~   r   c                    | |y| |k(  ryd| v rd|v ryd| v sd|v ryd| v sd|v r	d| v sd|v ry| j                  d      |j                  d      }}|D ]  }|t        v r||v s y | dv |dv }}|s|r	d| v sd|v ry|r|ryd	| v sd	|v rd| v sd|v ry| d
k(  s|d
k(  ryd| v sd|v s| dv r!|dv rd| v sd|v ryd| v sd|v ry| d
k(  s|d
k(  ryy)za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    TFLatin	Emoticons	Combining )HiraganaKatakanaCJKHangulzBasic Latin)r   r   PunctuationForms)splitr
   )r}   r~   keywords_range_akeywords_range_belrange_a_jp_charsrange_b_jp_charss          r%   r|   r|     s    /"9/)/!g&@o%)G 	?"g&@&+*H)8)>)>*S! '  00!!	 	
	

 	33 ' 	, E_$<,?"h/&AO#u'?m+-/O 	 E_$<3377O+}/Oo%O)Cm+-/Or(   i   decoded_sequencemaximum_thresholddebugc           	         t         j                         D cg c]	  } |        }}t        |       dz   }d}|dk  rd}n
|dk  rd}nd}t        | dz   t	        |            D ]^  \  }}	|D ]%  }
|
j                  |      s|
j                  |       ' |	d	kD  r|	|z  d	k(  s	|	|dz
  k(  sFt        d
 |D              }||k\  s^ n |rt        d      }|j                  t        d| d| d|        t        |       dkD  r8|j                  t        d| dd         |j                  t        d| dd         |D ]1  }|j                  t        |j                   d|j                          3 t        |d      S c c}w )zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    r   rN   i       r   r      
r   c              3   4   K   | ]  }|j                     y wrD   )r/   )r   dts     r%   r   zmess_ratio.<locals>.<genexpr>^  s     !?r"((!?s   charset_normalizerzIMess-detector extended-analysis start. intermediary_mean_mess_ratio_calc=z mean_mess_ratio=z maximum_threshold=r   zStarting with: NzEnding with: iz:    )r   __subclasses__r   r   r   r&   r*   sumr   logr	   	__class__r/   round)r   r   r   md_class	detectorslengthmean_mess_ratio!intermediary_mean_mess_ratio_calcr   indexdetectorloggerr   s                r%   
mess_ratior   ?  s    $6#D#D#F+
+I + &'!+F O|13)	4,.),/) 04 7vG 	5! 	)H  +i(	)
 AI%"CCqHfqj !!?Y!??O"33 /0

11R0SSdetdu v!!2 35	
  2%JJu0@"0E/FGHJJu.>su.E-FGH 	=BJJub
;<	= !$$[+s   E6N)g?F),	functoolsr   loggingr   typingr   r   constantr   r	   r
   utilsr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r;   rS   r`   rj   ru   r   r   r   r   r5   r6   r|   r8   r   r9   r(   r%   <module>r     sC     ! 
    *" "D,L'9 ,L^O1 O6E* E0"D&8 "DJ./( ./bsA- sAlB- B>IP0 IPX#1 #8 4Ec]E5=c]E	E EP 4IN4%4%.34%BF4%
4% 4%r(   