o
    2gOO                     @   s  d dl Z d dlmZmZ d dlmZmZmZmZ zd dl	m
Z
 W n ey+   eZ
Y nw ddlmZmZmZmZ ddlmZmZmZmZ ddlmZ dd	lmZmZ dd
lmZmZmZm Z m!Z!m"Z" e #dZ$e % Z&e&'e (d 							d%de)de*de*de+dee dee de,de,defddZ-							d%dede*de*de+dee dee de,de,defddZ.							d%d e
de*de*de+dee dee de,de,defd!d"Z/						d&d e
de*de*de+dee dee de,defd#d$Z0dS )'    N)basenamesplitext)BinaryIOListOptionalSet)PathLike   )coherence_ratioencoding_languagesmb_encoding_languagesmerge_coherence_ratios)IANA_SUPPORTEDTOO_BIG_SEQUENCETOO_SMALL_SEQUENCETRACE)
mess_ratio)CharsetMatchCharsetMatches)any_specified_encoding	iana_nameidentify_sig_or_bomis_cp_similaris_multi_byte_encodingshould_strip_sig_or_bomcharset_normalizerz)%(asctime)s | %(levelname)s | %(message)s      皙?TF	sequencessteps
chunk_size	thresholdcp_isolationcp_exclusionpreemptive_behaviourexplainreturnc           1      C   s	  t | ttfstdt| |rtj}tt	 t
t t| }	|	dkrGtd |r;tt	 t
|p9tj tt| dddg dgS |dur]ttd	d
| dd |D }ng }|duruttdd
| dd |D }ng }|	|| krttd|||	 d}|	}|dkr|	| |k rt|	| }t| tk }
t| tk}|
rttd|	 n|rttd|	 g }|rt| nd}|dur|| ttd| t }g }g }d}d}d}t }t| \}}|dur|| ttdt|| |d d|vr|d |t D ],}|r!||vr!q|r+||v r+q||v r2q|| d}||k}|oCt|}|dv rU|sUttd| qzt|}W n t t!fyo   ttd| Y qw z9|r|du rt"|du r| dtd n	| t|td |d nt"|du r| n| t|d |d}W n+ t#t$fy } zt |t$sttd|t"| || W Y d}~qd}~ww d}|D ]}t%||rd} nq|rttd|| qt&|sdnt||	t|	| }|o|duot||	k } | rttd| tt|d }!t'|!d }!d}"d}#g }$g }%|D ]}&|&| |	d! krAq4| |&|&|  }'|rU|du rU||' }'z|'j(||r^d"nd#d$}(W n" t#y } zttd%|t"| |!}"d}#W Y d}~ nd}~ww |r|&dkr| |& d&krt)|d'})|r|(d|) |vrt&|&|&d d(D ])}*| |*|&|  }'|r|du r||' }'|'j(|d"d$}(|(d|) |v r nq|$|( |%t*|(| |%d( |kr|"d7 }"|"|!ks|r|du r nq4|#s=|r=|s=z| td)d j(|d#d$ W n# t#y< } zttd*|t"| || W Y d}~qd}~ww |%rHt+|%t|% nd}+|+|ksT|"|!kr|| ttd+||"t,|+d, d-d. |dd|fv r|#st| ||dg |},||kr|,}n
|dkr|,}n|,}qttd/|t,|+d, d-d. |st-|}-nt.|}-|-rttd0|t"|- g }.|dkr|$D ]}(t/|(d1|-rd2|-nd}/|.|/ qt0|.}0|0rttd3|0| |t| ||+||0| ||ddfv r|+d1k rtd4| |rtt	 t
| t|| g  S ||kr@td5| |r7tt	 t
| t|| g  S qt|dkr|sR|sR|rXttd6 |rhtd7|j1 || n2|rp|du s|r}|r}|j2|j2ks|durtd8 || n|rtd9 || |rtd:|3 j1t|d  ntd; |rtt	 t
| |S )<ae  
    Given a raw bytes sequence, return the best possibles charset usable to render str objects.
    If there is no results, it is a strong indicator that the source is binary/not text.
    By default, the process will extract 5 blocs of 512o each to assess the mess and coherence of a given sequence.
    And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.

    The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
    but never take it for granted. Can improve the performance.

    You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
    purpose.

    This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
    By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
    toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
    Custom logging format and handler can be set manually.
    z4Expected object of type bytes or bytearray, got: {0}r   z<Encoding detection on empty bytes, assuming utf_8 intention.utf_8g        F Nz`cp_isolation is set. use this flag for debugging purpose. limited list of encoding allowed : %s.z, c                 S      g | ]}t |d qS Fr   .0cp r0   e/var/www/mastermindingenieria.com/MONITOR/venv/lib/python3.10/site-packages/charset_normalizer/api.py
<listcomp>]       zfrom_bytes.<locals>.<listcomp>zacp_exclusion is set. use this flag for debugging purpose. limited list of encoding excluded : %s.c                 S   r*   r+   r,   r-   r0   r0   r1   r2   h   r3   z^override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.r	   z>Trying to detect encoding from a tiny portion of ({}) byte(s).zIUsing lazy str decoding because the payload is quite large, ({}) byte(s).z@Detected declarative mark in sequence. Priority +1 given for %s.zIDetected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.ascii>   utf_16utf_32z[Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.z2Encoding %s does not provide an IncrementalDecoderg    A)encodingz9Code page %s does not fit given bytes sequence at ALL. %sTzW%s is deemed too similar to code page %s and was consider unsuited already. Continuing!zpCode page %s is a multi byte encoding table and it appear that at least one character was encoded using n-bytes.         ignorestrict)errorszaLazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s      g     j@z^LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %szc%s was excluded because of initial chaos probing. Gave up %i time(s). Computed mean chaos is %f %%.d      )ndigitsz=%s passed initial chaos probing. Mean measured chaos is %f %%z&{} should target any language(s) of {}g?,z We detected language {} using {}z.Encoding detection: %s is most likely the one.zoEncoding detection: %s is most likely the one as we detected a BOM or SIG within the beginning of the sequence.zONothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.z7Encoding detection: %s will be used as a fallback matchz:Encoding detection: utf_8 will be used as a fallback matchz:Encoding detection: ascii will be used as a fallback matchz]Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.z=Encoding detection: Unable to determine any suitable charset.)4
isinstance	bytearraybytes	TypeErrorformattypeloggerlevel
addHandlerexplain_handlersetLevelr   lendebugremoveHandlerloggingWARNINGr   r   logjoinintr   r   r   appendsetr   r   addr   r   ModuleNotFoundErrorImportErrorstrUnicodeDecodeErrorLookupErrorr   rangemaxdecodeminr   sumroundr   r   r
   r   r7   fingerprintbest)1r   r    r!   r"   r#   r$   r%   r&   previous_logger_levellengthis_too_small_sequenceis_too_large_sequenceprioritized_encodingsspecified_encodingtestedtested_but_hard_failuretested_but_soft_failurefallback_asciifallback_u8fallback_specifiedresultssig_encodingsig_payloadencoding_ianadecoded_payloadbom_or_sig_availablestrip_sig_or_bomis_multi_byte_decoderesimilar_soft_failure_testencoding_soft_failedr_multi_byte_bonusmax_chunk_gave_upearly_stop_countlazy_str_hard_failure	md_chunks	md_ratiosicut_sequencechunkchunk_partial_size_chkjmean_mess_ratiofallback_entrytarget_languages	cd_ratioschunk_languagescd_ratios_mergedr0   r0   r1   
from_bytes%   s  












































r   fpc              	   C   s   t |  |||||||S )z
    Same thing than the function from_bytes but using a file pointer that is already ready.
    Will not close the file pointer.
    )r   read)r   r    r!   r"   r#   r$   r%   r&   r0   r0   r1   from_fp  s   r   pathc           	   
   C   sD   t | d}t||||||||W  d   S 1 sw   Y  dS )z
    Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
    Can raise IOError.
    rbN)openr   )	r   r    r!   r"   r#   r$   r%   r&   r   r0   r0   r1   	from_path  s   $r   c              	   C   s   t | ||||||}t| }tt|}	t|dkr!td|| }
|	d  d|
j 7  < t	dt
| |d|	d}||
  W d   |
S 1 sTw   Y  |
S )zi
    Take a (text-based) file path and try to create another file next to it, this time using UTF-8.
    r   z;Unable to normalize "{}", no encoding charset seems to fit.-z{}r)   wbN)r   r   listr   rP   IOErrorrI   rg   r7   r   r]   replacerV   writeoutput)r   r    r!   r"   r#   r$   r%   rt   filenametarget_extensionsresultr   r0   r0   r1   	normalize7  s8   

r   )r   r   r   NNTF)r   r   r   NNT)1rS   os.pathr   r   typingr   r   r   r   osr   r\   r]   cdr
   r   r   r   constantr   r   r   r   mdr   modelsr   r   utilsr   r   r   r   r   r   	getLoggerrK   StreamHandlerrN   setFormatter	FormatterrG   rW   floatboolr   r   r   r   r0   r0   r0   r1   <module>   s     
	
   b	
	
