Viewing file: drv_sgmlop_html.py (2.41 KB) -rw-r--r-- Select action/file-type: (+) | (+) | (+) | Code (+) | Session (+) | (+) | SDB (+) | (+) | (+) | (+) | (+) | (+) |
""" SAX2 driver for parsing HTML with the sgmlop parser.
$Id: drv_sgmlop_html.py,v 1.3 2002/05/10 14:50:06 akuchling Exp $ """
version = "0.1"
from drv_sgmlop import * from xml.dom.html import HTML_CHARACTER_ENTITIES, HTML_FORBIDDEN_END, HTML_OPT_END, HTML_DTD from string import strip, upper
class SaxHtmlParser(SaxParser):
def __init__(self, bufsize = 65536, encoding = 'iso-8859-1', verbose = 0): SaxParser.__init__(self, bufsize, encoding) self.verbose = verbose
def finish_starttag(self, tag, attrs): """uses the HTML DTD to automatically generate events for missing tags"""
# guess omitted close tags while self.stack and \ upper(self.stack[-1]) in HTML_OPT_END and \ tag not in HTML_DTD.get(self.stack[-1],[]): self.unknown_endtag(self.stack[-1]) del self.stack[-1]
if self.stack and tag not in HTML_DTD.get(self.stack[-1],[]) and self.verbose: print 'Warning : trying to add %s as a child of %s'%\ (tag,self.stack[-1])
self.unknown_starttag(tag,attrs) if upper(tag) in HTML_FORBIDDEN_END: # close immediately tags for which we won't get an end self.unknown_endtag(tag) return 0 else: self.stack.append(tag) return 1
def finish_endtag(self, tag): if tag in HTML_FORBIDDEN_END : # do nothing: we've already closed it return if tag in self.stack: while self.stack and self.stack[-1] != tag: self.unknown_endtag(self.stack[-1]) del self.stack[-1] self.unknown_endtag(tag) del self.stack[-1] elif self.verbose: print "Warning: I don't see where tag %s was opened"%tag
def handle_data(self,data): if self.stack: if '#PCDATA' not in HTML_DTD.get(self.stack[-1],[]) and not strip(data): # this is probably ignorable whitespace self._cont_handler.ignorableWhitespace(data) else: self._cont_handler.characters(to_xml_string(data,self._encoding))
def close(self): SGMLParser.close(self) self.stack.reverse() for tag in self.stack: self.unknown_endtag(tag) self.stack = [] self._cont_handler.endDocument()
def create_parser(): return SaxHtmlParser()
|