
\[\-Q                 @   sl  d  Z  d d l Z d d l Z d d l Z d d l m Z d g Z e j d  Z e j d  Z	 e j d  Z
 e j d  Z e j d	  Z e j d
  Z e j d  Z e j d  Z e j d  Z e j d  Z e j d  Z e j d e j  Z e j d e j  Z e j d
  Z e j d  Z Gd d   d e  Z e   Z Gd d   d e j  Z d S)zA parser for HTML and XHTML.    N)unescape
HTMLParserz[&<]z
&[a-zA-Z#]z%&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]z)&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]z	<[a-zA-Z]>z--\s*>z(([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*z$([a-zA-Z][^	
 /> ]*)(?:\s|/(?!>))*zJ\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?z]((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*a  
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
aF  
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
z#</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>c               @   s1   e  Z d  Z d Z d d d  Z d d   Z d S)HTMLParseErrorz&Exception raised for all parse errors.Nc             C   s3   | s t   | |  _ | d |  _ | d |  _ d  S)Nr      )AssertionErrormsglinenooffset)selfr   Zposition r   !/usr/lib/python3.4/html/parser.py__init__U   s    	zHTMLParseError.__init__c             C   sW   |  j  } |  j d  k	 r, | d |  j } n  |  j d  k	 rS | d |  j d } n  | S)Nz, at line %dz, column %dr   )r   r	   r
   )r   resultr   r   r   __str__[   s    	zHTMLParseError.__str__)NN)__name__
__module____qualname____doc__r   r   r   r   r   r   r   R   s   r   c               @   sf  e  Z d  Z d Z d; Z e d e d d Z d d   Z d	 d
   Z d d   Z	 d d   Z
 d Z d d   Z d d   Z d d   Z d d   Z d d   Z d d d  Z d d   Z d d    Z d! d"   Z d# d$   Z d% d&   Z d' d(   Z d) d*   Z d+ d,   Z d- d.   Z d/ d0   Z d1 d2   Z d3 d4   Z d5 d6   Z d7 d8   Z d9 d:   Z  d S)<r   aE  Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  If convert_charrefs is
    True the character references are converted automatically to the
    corresponding Unicode character (and self.handle_data() is no
    longer split in chunks), otherwise they are passed by calling
    self.handle_entityref() or self.handle_charref() with the string
    containing respectively the named or numeric reference as the
    argument.
    scriptstyleconvert_charrefsc            C   sv   | t  k	 r% t j d t d d n d } | |  _ | t  k r_ d } t j d t d d n  | |  _ |  j   d S)a  Initialize and reset this instance.

        If convert_charrefs is True (default: False), all character references
        are automatically converted to the corresponding Unicode characters.
        If strict is set to False (the default) the parser will parse invalid
        markup, otherwise it will raise an error.  Note that the strict mode
        and argument are deprecated.
        z,The strict argument and mode are deprecated.
stacklevel   FzfThe value of convert_charrefs will become True in 3.5. You are encouraged to set the value explicitly.N)_default_sentinelwarningswarnDeprecationWarningstrictr   reset)r   r   r   r   r   r   r   ~   s    
				zHTMLParser.__init__c             C   s8   d |  _  d |  _ t |  _ d |  _ t j j |   d S)z1Reset this instance.  Loses all unprocessed data. z???N)rawdatalasttaginteresting_normalinteresting
cdata_elem_markupbase
ParserBaser   )r   r   r   r   r      s
    				zHTMLParser.resetc             C   s!   |  j  | |  _  |  j d  d S)zFeed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        r   N)r!   goahead)r   datar   r   r   feed   s    zHTMLParser.feedc             C   s   |  j  d  d S)zHandle any buffered data.r   N)r(   )r   r   r   r   close   s    zHTMLParser.closec             C   s/   t  j d t d d t | |  j     d  S)Nz!The 'error' method is deprecated.r   r   )r   r   r   r   getpos)r   messager   r   r   error   s    	zHTMLParser.errorNc             C   s   |  j  S)z)Return full source of start tag: '<...>'.)_HTMLParser__starttag_text)r   r   r   r   get_starttag_text   s    zHTMLParser.get_starttag_textc             C   s2   | j    |  _ t j d |  j t j  |  _ d  S)Nz</\s*%s\s*>)lowerr%   recompileIr$   )r   elemr   r   r   set_cdata_mode   s    zHTMLParser.set_cdata_modec             C   s   t  |  _ d  |  _ d  S)N)r#   r$   r%   )r   r   r   r   clear_cdata_mode   s    	zHTMLParser.clear_cdata_modec       
      C   s  |  j  } d } t |  } xD| | k  ra|  j rq |  j rq | j d |  } | d k  r | se Pn  | } q n= |  j j | |  } | r | j   } n |  j r Pn  | } | | k  r|  j r |  j r |  j t	 | | |    q|  j | | |   n  |  j
 | |  } | | k r)Pn  | j } | d |  rt j | |  re|  j |  } n | d |  r|  j |  } n | d |  r|  j |  } n | d |  r|  j |  } ng | d |  r|  j r|  j |  } q/|  j |  } n+ | d | k  r.|  j d  | d } n P| d k  r	| sEPn  |  j r^|  j d  n  | j d	 | d  } | d k  r| j d | d  } | d k  r| d } qn
 | d 7} |  j r|  j r|  j t	 | | |    q	|  j | | |   n  |  j
 | |  } q | d
 |  rt j | |  } | r| j   d d  }	 |  j |	  | j   } | d | d  s| d } n  |  j
 | |  } q q^d | | d   k r|  j | | | d   |  j
 | | d  } n  Pq | d |  rLt j | |  } | r| j d  }	 |  j |	  | j   } | d | d  si| d } n  |  j
 | |  } q n  t j | |  } | r| r| j   | | d   k r|  j r|  j d  q| j   } | | k r| } n  |  j
 | | d  } n  Pq^| d | k  rH|  j d  |  j
 | | d  } q^Pq d s t d   q W| r| | k  r|  j r|  j r|  j r|  j t	 | | |    n |  j | | |   |  j
 | |  } n  | | d   |  _  d  S)Nr   <z</z<!--z<?z<!r   zEOF in middle of constructr   z&#r   ;&z#EOF in middle of entity or char refzinteresting.search() lied)r!   lenr   r%   findr$   searchstarthandle_datar   	updatepos
startswithstarttagopenmatchparse_starttagparse_endtagparse_commentparse_pir   Zparse_declarationparse_html_declarationr.   charrefgrouphandle_charrefend	entityrefhandle_entityref
incompleter   )
r   rM   r!   injrD   rB   knamer   r   r   r(      s    		  			
 "		 zHTMLParser.goaheadc             C   s   |  j  } | | | d  d k s/ t d   | | | d  d k rV |  j |  S| | | d  d k r} |  j |  S| | | d  j   d	 k r | j d
 | d  } | d k r d S|  j | | d |   | d S|  j |  Sd  S)Nr   z<!z+unexpected call to parse_html_declaration()   z<!--   z<![	   z	<!doctyper   r   r;   r;   )r!   r   rG   Zparse_marked_sectionr1   r=   handle_declparse_bogus_comment)r   rQ   r!   gtposr   r   r   rI   ;  s    	& z!HTMLParser.parse_html_declarationr   c             C   s   |  j  } | | | d  d k s/ t d   | j d | d  } | d k rU d	 S| ry |  j | | d |   n  | d S)
Nr   <!</z"unexpected call to parse_comment()r   r   )r\   r]   r;   r;   )r!   r   r=   handle_comment)r   rQ   Zreportr!   posr   r   r   rZ   P  s    	&zHTMLParser.parse_bogus_commentc             C   s   |  j  } | | | d  d k s/ t d   t j | | d  } | sO d S| j   } |  j | | d |   | j   } | S)Nr   z<?zunexpected call to parse_pi()r   r;   )r!   r   picloser>   r?   	handle_pirM   )r   rQ   r!   rD   rS   r   r   r   rH   \  s    	&zHTMLParser.parse_pic             C   s  d  |  _  |  j |  } | d k  r( | S|  j } | | |  |  _  g  } |  j rl t j | | d  } n t j | | d  } | s t d   | j   } | j	 d  j
   |  _ } x$| | k  r|  j r t j | |  } n t j | |  } | sPn  | j	 d d d  \ }	 }
 } |
 s2d  } ns | d  d  d k o]| d d   k n s| d  d  d k o| d d   k n r| d d  } n  | rt |  } n  | j |	 j
   | f  | j   } q W| | |  j   } | d k r|  j   \ } } d
 |  j  k r^| |  j  j d
  } t |  j   |  j  j d
  } n | t |  j   } |  j r|  j d | | |  d  d  f  n  |  j | | |   | S| j d	  r|  j | |  n/ |  j | |  | |  j k r|  j |  n  | S)Nr   r   z#unexpected call to parse_starttag()r   rW   '"r   />
z junk characters in start tag: %r   r;   r;   r;   )r   rd   )r/   check_for_whole_start_tagr!   r   tagfindrD   tagfind_tolerantr   rM   rK   r1   r"   attrfindattrfind_tolerantr   appendstripr,   countr<   rfindr.   r@   endswithhandle_startendtaghandle_starttagCDATA_CONTENT_ELEMENTSr6   )r   rQ   endposr!   attrsrD   rT   tagmZattrnamerestZ	attrvaluerM   r	   r
   r   r   r   rE   h  s`    					00		"zHTMLParser.parse_starttagc             C   sk  |  j  } |  j r' t j | |  } n t j | |  } | r[| j   } | | | d  } | d k rs | d S| d k r | j d |  r | d S| j d |  r d S|  j r |  j | | d  |  j d  n  | | k r | S| d Sn  | d k rd S| d k rd S|  j r@|  j | |  |  j d	  n  | | k rP| S| d Sn  t	 d
   d  S)Nr   r   /z/>r   zmalformed empty start tagr    z6abcdefghijklmnopqrstuvwxyz=/ABCDEFGHIJKLMNOPQRSTUVWXYZzmalformed start tagzwe should not get here!r;   r;   r;   )
r!   r   locatestarttagendrD   locatestarttagend_tolerantrM   rB   rA   r.   r   )r   rQ   r!   rw   rS   nextr   r   r   rg     s>    				z$HTMLParser.check_for_whole_start_tagc             C   s  |  j  } | | | d  d k s/ t d   t j | | d  } | sO d	 S| j   } t j | |  } | sW|  j d  k	 r |  j | | |   | S|  j	 r |  j
 d | | |  f  n  t j | | d  } | s| | | d  d k r| d S|  j |  Sn  | j d  j   } | j d | j    } |  j |  | d S| j d  j   } |  j d  k	 r| |  j k r|  j | | |   | Sn  |  j | j    |  j   | S)
Nr   z</zunexpected call to parse_endtagr   zbad end tag: %rrW   z</>r   r;   )r!   r   	endendtagr>   rM   
endtagfindrD   r%   r@   r   r.   ri   rZ   rK   r1   r=   handle_endtagr7   )r   rQ   r!   rD   r[   Z	namematchZtagnamer5   r   r   r   rF     s<    	&	!
zHTMLParser.parse_endtagc             C   s!   |  j  | |  |  j |  d  S)N)rr   r   )r   rv   ru   r   r   r   rq     s    zHTMLParser.handle_startendtagc             C   s   d  S)Nr   )r   rv   ru   r   r   r   rr     s    zHTMLParser.handle_starttagc             C   s   d  S)Nr   )r   rv   r   r   r   r     s    zHTMLParser.handle_endtagc             C   s   d  S)Nr   )r   rU   r   r   r   rL     s    zHTMLParser.handle_charrefc             C   s   d  S)Nr   )r   rU   r   r   r   rO   
  s    zHTMLParser.handle_entityrefc             C   s   d  S)Nr   )r   r)   r   r   r   r@     s    zHTMLParser.handle_datac             C   s   d  S)Nr   )r   r)   r   r   r   r^     s    zHTMLParser.handle_commentc             C   s   d  S)Nr   )r   Zdeclr   r   r   rY     s    zHTMLParser.handle_declc             C   s   d  S)Nr   )r   r)   r   r   r   ra     s    zHTMLParser.handle_pic             C   s$   |  j  r  |  j d | f  n  d  S)Nzunknown declaration: %r)r   r.   )r   r)   r   r   r   unknown_decl  s    	zHTMLParser.unknown_declc             C   s    t  j d t d d t |  S)NzZThe unescape method is deprecated and will be removed in 3.5, use html.unescape() instead.r   r   )r   r   r   r   )r   sr   r   r   r   "  s    	zHTMLParser.unescape)r   r   )!r   r   r   r   rs   r   r   r   r*   r+   r.   r/   r0   r6   r7   r(   rI   rZ   rH   rE   rg   rF   rq   rr   r   rL   rO   r@   r^   rY   ra   r   r   r   r   r   r   r   f   s<   	z<+*)r   r2   r   r&   Zhtmlr   __all__r3   r#   rP   rN   rJ   rC   r`   Zcommentcloserh   ri   rj   rk   VERBOSErz   r{   r}   r~   	Exceptionr   objectr   r'   r   r   r   r   r   <module>   s6   
				