3
0dG:                 @   s   d Z dZdgZddlmZ ddlZddlZddlmZm	Z	m
Z
mZmZ ddlmZmZ ddlmZmZmZmZmZ d	ZG d
d deeZG dd deZdS )zCUse the HTMLParser library to parse HTML files that aren't too bad.MITHTMLParserTreeBuilder    )
HTMLParserN)CDataCommentDeclarationDoctypeProcessingInstruction)EntitySubstitutionUnicodeDammit)DetectsXMLParsedAsHTMLParserRejectedMarkupHTMLHTMLTreeBuilderSTRICTzhtml.parserc               @   s|   e Zd ZdZdZdZdd Zdd Zdd	 ZdddZ	dddZ
dd Zdd Zdd Zdd Zdd Zdd Zdd ZdS ) BeautifulSoupHTMLParserzA subclass of the Python standard library's HTMLParser class, which
    listens for HTMLParser events and translates them into calls
    to Beautiful Soup's tree construction API.
    ignorereplacec             O   s4   |j d| j| _tj| f|| g | _| j  dS )a  Constructor.

        :param on_duplicate_attribute: A strategy for what to do if a
            tag includes the same attribute more than once. Accepted
            values are: REPLACE (replace earlier values with later
            ones, the default), IGNORE (keep the earliest value
            encountered), or a callable. A callable must take three
            arguments: the dictionary of attributes already processed,
            the name of the duplicate attribute, and the most recent value
            encountered.           
        on_duplicate_attributeN)popREPLACEr   r   __init__already_closed_empty_elementZ_initialize_xml_detector)selfargskwargs r   I/var/www/html/virt/lib/python3.6/site-packages/bs4/builder/_htmlparser.pyr   .   s
    	z BeautifulSoupHTMLParser.__init__c             C   s   t |d S )N)r   )r   messager   r   r   errorJ   s    zBeautifulSoupHTMLParser.errorc             C   s   | j ||dd}| j| dS )zHandle an incoming empty-element tag.

        This is only called when the markup looks like <tag/>.

        :param name: Name of the tag.
        :param attrs: Dictionary of the tag's attributes.
        F)handle_empty_elementN)handle_starttaghandle_endtag)r   nameattrstagr   r   r   handle_startendtagZ   s    z*BeautifulSoupHTMLParser.handle_startendtagTc             C   s   i }xh|D ]`\}}|dkrd}||kr^| j }|| jkr8qf|d| jfkrP|||< qf|||| n|||< d}q
W | j \}	}
| jj|dd||	|
d}|r|jr|r| j|dd | jj	| | j
dkr| j| dS )a3  Handle an opening tag, e.g. '<tag>'

        :param name: Name of the tag.
        :param attrs: Dictionary of the tag's attributes.
        :param handle_empty_element: True if this tag is known to be
            an empty-element tag (i.e. there is not expected to be any
            closing tag).
        N z"")
sourceline	sourceposF)check_already_closed)r   IGNOREr   getpossoupr!   Zis_empty_elementr"   r   appendZ	_root_tagZ_root_tag_encountered)r   r#   r$   r    Z	attr_dictkeyvalueZon_dupe	attrvaluer(   r)   r%   r   r   r   r!   i   s,    





z'BeautifulSoupHTMLParser.handle_starttagc             C   s,   |r|| j kr| j j| n| jj| dS )zHandle a closing tag, e.g. '</tag>'
        
        :param name: A tag name.
        :param check_already_closed: True if this tag is expected to
           be the closing portion of an empty-element tag,
           e.g. '<tag></tag>'.
        N)r   remover-   r"   )r   r#   r*   r   r   r   r"      s    	z%BeautifulSoupHTMLParser.handle_endtagc             C   s   | j j| dS )z4Handle some textual data that shows up between tags.N)r-   handle_data)r   datar   r   r   r3      s    z#BeautifulSoupHTMLParser.handle_datac             C   s   |j drt|jdd}n$|j dr8t|jdd}nt|}d}|dk rxR| jjdfD ]B}|sdqZyt|gj|}W qZ tk
r } zW Y dd}~X qZX qZW |syt|}W n& t	t
fk
r } zW Y dd}~X nX |pd}| j| dS )zHandle a numeric character reference by converting it to the
        corresponding Unicode character and treating it as textual
        data.

        :param name: Character number, possibly in hexadecimal.
        x   XN   zwindows-1252u   �)
startswithintlstripr-   original_encoding	bytearraydecodeUnicodeDecodeErrorchr
ValueErrorOverflowErrorr3   )r   r#   Z	real_namer4   encodinger   r   r   handle_charref   s*    

z&BeautifulSoupHTMLParser.handle_charrefc             C   s0   t jj|}|dk	r|}nd| }| j| dS )zHandle a named entity reference by converting it to the
        corresponding Unicode character(s) and treating it as textual
        data.

        :param name: Name of the entity reference.
        Nz&%s)r
   ZHTML_ENTITY_TO_CHARACTERgetr3   )r   r#   	characterr4   r   r   r   handle_entityref   s
    z(BeautifulSoupHTMLParser.handle_entityrefc             C   s&   | j j  | j j| | j jt dS )zOHandle an HTML comment.

        :param data: The text of the comment.
        N)r-   endDatar3   r   )r   r4   r   r   r   handle_comment   s    
z&BeautifulSoupHTMLParser.handle_commentc             C   s6   | j j  |tdd }| j j| | j jt dS )zYHandle a DOCTYPE declaration.

        :param data: The text of the declaration.
        zDOCTYPE N)r-   rI   lenr3   r   )r   r4   r   r   r   handle_decl   s    
z#BeautifulSoupHTMLParser.handle_declc             C   sN   |j  jdr$t}|tdd }nt}| jj  | jj| | jj| dS )z{Handle a declaration of unknown type -- probably a CDATA block.

        :param data: The text of the declaration.
        zCDATA[N)upperr9   r   rK   r   r-   rI   r3   )r   r4   clsr   r   r   unknown_decl  s    
z$BeautifulSoupHTMLParser.unknown_declc             C   s0   | j j  | j j| | j| | j jt dS )z\Handle a processing instruction.

        :param data: The text of the instruction.
        N)r-   rI   r3   Z_document_might_be_xmlr	   )r   r4   r   r   r   	handle_pi  s    

z!BeautifulSoupHTMLParser.handle_piN)T)T)__name__
__module____qualname____doc__r+   r   r   r   r&   r!   r"   r3   rE   rH   rJ   rL   rO   rP   r   r   r   r   r   $   s   
7
(	
r   c                   sN   e Zd ZdZdZdZeZeee	gZ
dZd fdd	ZdddZd	d
 Z  ZS )r   zpA Beautiful soup `TreeBuilder` that uses the `HTMLParser` parser,
    found in the Python standard library.
    FTNc                sp   t  }x&dD ]}||kr|j|}|||< qW tt| jf | |pFg }|pNi }|j| d|d< ||f| _dS )a  Constructor.

        :param parser_args: Positional arguments to pass into 
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param parser_kwargs: Keyword arguments to pass into 
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param kwargs: Keyword arguments for the superclass constructor.
        r   Fconvert_charrefsN)r   )dictr   superr   r   updateparser_args)r   rY   Zparser_kwargsr   Zextra_parser_kwargsargr0   )	__class__r   r   r   *  s    


zHTMLParserTreeBuilder.__init__c       	      c   s\   t |tr|dddfV  dS |g}|g}||g}t|||d|d}|j|j|j|jfV  dS )a  Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: Some markup -- probably a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried 
         in turn.
        NFT)known_definite_encodingsuser_encodingsZis_htmlexclude_encodings)
isinstancestrr   markupr<   Zdeclared_html_encodingZcontains_replacement_characters)	r   ra   Zuser_specified_encodingZdocument_declared_encodingr^   r\   r]   Ztry_encodingsZdammitr   r   r   prepare_markupC  s    
z$HTMLParserTreeBuilder.prepare_markupc             C   sh   | j \}}t||}| j|_y|j| W n* tk
rT } zt|W Y dd}~X nX |j  g |_dS )z{Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in self.soup.
        N)rY   r   r-   feedAssertionErrorr   closer   )r   ra   r   r   parserrD   r   r   r   rc   t  s    

zHTMLParserTreeBuilder.feed)NN)NNN)rQ   rR   rS   rT   Zis_xmlZ	picklable
HTMLPARSERNAMEr   r   featuresZTRACKS_LINE_NUMBERSr   rb   rc   __classcell__r   r   )r[   r   r     s   
 
0)rT   __license____all__html.parserr   syswarningsZbs4.elementr   r   r   r   r	   Z
bs4.dammitr
   r   Zbs4.builderr   r   r   r   r   rg   r   r   r   r   r   r   <module>   s   	 z