
-c`?E                 @   s  d  Z  d d l Z d d l Z d d l Z d d l m Z d g Z e j d  Z e j d  Z	 e j d  Z
 e j d  Z e j d	  Z e j d
  Z e j d  Z e j d  Z e j d  Z e j d e j  Z e j d
  Z e j d  Z Gd d   d e j  Z d S)zA parser for HTML and XHTML.    N)unescape
HTMLParserz[&<]z
&[a-zA-Z#]z%&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]z)&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]z	<[a-zA-Z]>z--\s*>z$([a-zA-Z][^	
 /> ]*)(?:\s|/(?!>))*z]((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*aF  
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
z#</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>c               @   sW  e  Z d  Z d Z d: Z d d d d  Z d d	   Z d
 d   Z d d   Z d Z	 d d   Z
 d d   Z d d   Z d d   Z d d   Z d d d  Z d d   Z d d   Z d  d!   Z d" d#   Z d$ d%   Z d& d'   Z d( d)   Z d* d+   Z d, d-   Z d. d/   Z d0 d1   Z d2 d3   Z d4 d5   Z d6 d7   Z d8 d9   Z d S);r   aE  Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  If convert_charrefs is
    True the character references are converted automatically to the
    corresponding Unicode character (and self.handle_data() is no
    longer split in chunks), otherwise they are passed by calling
    self.handle_entityref() or self.handle_charref() with the string
    containing respectively the named or numeric reference as the
    argument.
    scriptstyleconvert_charrefsTc            C   s   | |  _  |  j   d S)zInitialize and reset this instance.

        If convert_charrefs is True (the default), all character references
        are automatically converted to the corresponding Unicode characters.
        N)r   reset)selfr    r
   !/usr/lib/python3.5/html/parser.py__init__W   s    	zHTMLParser.__init__c             C   s8   d |  _  d |  _ t |  _ d |  _ t j j |   d S)z1Reset this instance.  Loses all unprocessed data. z???N)rawdatalasttaginteresting_normalinteresting
cdata_elem_markupbase
ParserBaser   )r	   r
   r
   r   r   `   s
    				zHTMLParser.resetc             C   s!   |  j  | |  _  |  j d  d S)zFeed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        r   N)r   goahead)r	   datar
   r
   r   feedh   s    zHTMLParser.feedc             C   s   |  j  d  d S)zHandle any buffered data.   N)r   )r	   r
   r
   r   closeq   s    zHTMLParser.closeNc             C   s   |  j  S)z)Return full source of start tag: '<...>'.)_HTMLParser__starttag_text)r	   r
   r
   r   get_starttag_textw   s    zHTMLParser.get_starttag_textc             C   s2   | j    |  _ t j d |  j t j  |  _ d  S)Nz</\s*%s\s*>)lowerr   recompileIr   )r	   elemr
   r
   r   set_cdata_mode{   s    zHTMLParser.set_cdata_modec             C   s   t  |  _ d  |  _ d  S)N)r   r   r   )r	   r
   r
   r   clear_cdata_mode   s    	zHTMLParser.clear_cdata_modec             C   s  |  j  } d } t |  } x| | k  r+|  j r |  j r | j d |  } | d k  r | j d t | | d   } | d k r t j d  j	 | |  r P| } n: |  j
 j	 | |  } | r | j   } n |  j r P| } | | k  r<|  j r%|  j r%|  j t | | |    n |  j | | |   |  j | |  } | | k r[P| j } | d |  rt j | |  r|  j |  }	 n | d |  r|  j |  }	 n | d |  r|  j |  }	 nm | d |  r|  j |  }	 nL | d	 |  r|  j |  }	 n+ | d
 | k  rE|  j d  | d
 }	 n P|	 d k  r| sYP| j d | d
  }	 |	 d k  r| j d | d
  }	 |	 d k  r| d
 }	 n
 |	 d
 7}	 |  j r|  j r|  j t | | |	    n |  j | | |	   |  j | |	  } q | d |  rt j | |  } | r| j   d d  }
 |  j |
  | j   }	 | d |	 d
  s|	 d
 }	 |  j | |	  } q q(d | | d   k r|  j | | | d   |  j | | d  } Pq | d |  rt j | |  } | rj| j d
  }
 |  j |
  | j   }	 | d |	 d
  sU|	 d
 }	 |  j | |	  } q t j | |  } | r| r| j   | | d   k r| j   }	 |	 | k r| }	 |  j | | d
  } Pq(| d
 | k  r|  j d  |  j | | d
  } q(Pq d s t d   q W| r| | k  r|  j r|  j r{|  j r{|  j t | | |    n |  j | | |   |  j | |  } | | d   |  _  d  S)Nr   <&"   z[\s;]z</z<!--z<?z<!r   r   z&#   ;zinteresting.search() lied)r   lenr   r   findrfindmaxr   r   searchr   starthandle_datar   Z	updatepos
startswithstarttagopenmatchparse_starttagparse_endtagparse_commentparse_piparse_html_declarationcharrefgrouphandle_charrefend	entityrefhandle_entityref
incompleteAssertionError)r	   r;   r   injZampposr2   r0   knamer
   r
   r   r      s    			  	
 

" zHTMLParser.goaheadc             C   s   |  j  } | | | d  d k s/ t d   | | | d  d k rV |  j |  S| | | d  d k r} |  j |  S| | | d  j   d	 k r | j d
 | d  } | d k r d S|  j | | d |   | d S|  j |  Sd  S)Nr&   z<!z+unexpected call to parse_html_declaration()   z<!--   z<![	   z	<!doctyper   r   r(   r(   )r   r?   r5   Zparse_marked_sectionr   r*   handle_declparse_bogus_comment)r	   r@   r   gtposr
   r
   r   r7      s    	& z!HTMLParser.parse_html_declarationr   c             C   s~   |  j  } | | | d  d k s/ t d   | j d | d  } | d k rU d	 S| rv |  j | | d |   | d S)
Nr&   <!</z"unexpected call to parse_comment()r   r   )rK   rL   r(   r(   )r   r?   r*   handle_comment)r	   r@   Zreportr   posr
   r
   r   rI     s    	&zHTMLParser.parse_bogus_commentc             C   s   |  j  } | | | d  d k s/ t d   t j | | d  } | sO d S| j   } |  j | | d |   | j   } | S)Nr&   z<?zunexpected call to parse_pi()r   r(   )r   r?   picloser-   r.   	handle_pir;   )r	   r@   r   r2   rB   r
   r
   r   r6   !  s    	&zHTMLParser.parse_pic             C   s  d  |  _  |  j |  } | d k  r( | S|  j } | | |  |  _  g  } t j | | d  } | sr t d   | j   } | j d  j   |  _	 } x | | k  rt
 j | |  } | s P| j d d d  \ }	 }
 } |
 s d  } np | d  d  d k o| d d   k n sO| d  d  d k oJ| d d   k n r_| d d  } | rqt |  } | j |	 j   | f  | j   } q W| | |  j   } | d k rC|  j   \ } } d
 |  j  k r| |  j  j d
  } t |  j   |  j  j d
  } n | t |  j   } |  j | | |   | S| j d	  re|  j | |  n, |  j | |  | |  j k r|  j |  | S)Nr   r   z#unexpected call to parse_starttag()r&   rF   '"r   />
r(   r(   r(   )r   rS   )r   check_for_whole_start_tagr   tagfind_tolerantr2   r?   r;   r9   r   r   attrfind_tolerantr   appendstripZgetposcountr)   r+   r/   endswithhandle_startendtaghandle_starttagCDATA_CONTENT_ELEMENTSr!   )r	   r@   endposr   attrsr2   rC   tagmZattrnamerestZ	attrvaluer;   linenooffsetr
   r
   r   r3   -  sR    			00zHTMLParser.parse_starttagc             C   s   |  j  } t j | |  } | r | j   } | | | d  } | d k rU | d S| d k r | j d |  r{ | d S| j d |  r d	 S| | k r | S| d S| d k r d
 S| d k r d S| | k r | S| d St d   d  S)Nr   r   /z/>r&   r   z6abcdefghijklmnopqrstuvwxyz=/ABCDEFGHIJKLMNOPQRSTUVWXYZzwe should not get here!r(   r(   r(   )r   locatestarttagend_tolerantr2   r;   r0   r?   )r	   r@   r   rb   rB   nextr
   r
   r   rU   `  s.    	z$HTMLParser.check_for_whole_start_tagc             C   s  |  j  } | | | d  d k s/ t d   t j | | d  } | sO d S| j   } t j | |  } | s*|  j d  k	 r |  j | | |   | St	 j | | d  } | s | | | d  d k r | d S|  j
 |  S| j d  j   } | j d | j    } |  j |  | d S| j d  j   } |  j d  k	 rx| |  j k rx|  j | | |   | S|  j | j    |  j   | S)	Nr&   z</zunexpected call to parse_endtagr   rF   z</>r   r(   )r   r?   	endendtagr-   r;   
endtagfindr2   r   r/   rV   rI   r9   r   r*   handle_endtagr"   )r	   r@   r   r2   rJ   Z	namematchZtagnamer    r
   r
   r   r4     s8    	&
zHTMLParser.parse_endtagc             C   s!   |  j  | |  |  j |  d  S)N)r]   rk   )r	   ra   r`   r
   r
   r   r\     s    zHTMLParser.handle_startendtagc             C   s   d  S)Nr
   )r	   ra   r`   r
   r
   r   r]     s    zHTMLParser.handle_starttagc             C   s   d  S)Nr
   )r	   ra   r
   r
   r   rk     s    zHTMLParser.handle_endtagc             C   s   d  S)Nr
   )r	   rD   r
   r
   r   r:     s    zHTMLParser.handle_charrefc             C   s   d  S)Nr
   )r	   rD   r
   r
   r   r=     s    zHTMLParser.handle_entityrefc             C   s   d  S)Nr
   )r	   r   r
   r
   r   r/     s    zHTMLParser.handle_datac             C   s   d  S)Nr
   )r	   r   r
   r
   r   rM     s    zHTMLParser.handle_commentc             C   s   d  S)Nr
   )r	   Zdeclr
   r
   r   rH     s    zHTMLParser.handle_declc             C   s   d  S)Nr
   )r	   r   r
   r
   r   rP     s    zHTMLParser.handle_pic             C   s   d  S)Nr
   )r	   r   r
   r
   r   unknown_decl  s    zHTMLParser.unknown_declc             C   s    t  j d t d d t |  S)NzZThe unescape method is deprecated and will be removed in 3.5, use html.unescape() instead.
stacklevelr&   )warningswarnDeprecationWarningr   )r	   sr
   r
   r   r     s    	zHTMLParser.unescape)r   r   )__name__
__module____qualname____doc__r^   r   r   r   r   r   r   r!   r"   r   r7   rI   r6   r3   rU   r4   r\   r]   rk   r:   r=   r/   rM   rH   rP   rl   r   r
   r
   r
   r   r   ?   s8   		z3"()ru   r   rn   r   Zhtmlr   __all__r   r   r>   r<   r8   r1   rO   ZcommentcloserV   rW   VERBOSErg   ri   rj   r   r   r
   r
   r
   r   <module>   s(   
		