Difference between revisions of "Python - HTML Parsers"
Jump to navigation
Jump to search
PeterHarding (talk | contribs) (New page: =parsers= See BeautifulSoup ... ==Examples== <pre> #!/usr/bin/env python import re f_in = open("BO_1001_Body_01.txt", "r") data = f_in.read() s = re.compile('(<[^ >]*)') e = re.comp...) |
PeterHarding (talk | contribs) |
||
Line 10: | Line 10: | ||
import re | import re | ||
f_in = open(" | f_in = open("Some.html", "r") | ||
data = f_in.read() | data = f_in.read() |
Latest revision as of 18:35, 23 April 2009
parsers
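See BeautifulSoup for a full-featured HTML parser. The example below is only a minimal regex-based splitter that separates a file into tags and the text between them.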
Examples
#!/usr/bin/env python
"""Split an HTML file into its tags and the text found between them.

This is a deliberately small regex-based scanner, not a real HTML parser.
Every tag and every run of text between two tags is printed on its own
line; whatever follows the last complete tag is printed as ``Rest: [...]``.
"""

import re
import sys

# Start of a tag: '<' followed by the tag name (anything up to a space or '>').
TAG_OPEN = re.compile('(<[^ >]*)')
# End of a tag.
TAG_CLOSE = re.compile('>')


def _iter_pieces(data):
    """Yield, in order, each line of output for the markup in *data*.

    Yields the raw text of every complete tag and of every non-empty run of
    text between two tags.  Scanning stops with a ``Rest: [...]`` line when
    no further tag start is found, or with ``No end tag`` followed by a
    ``Rest: [...]`` line when a tag is opened but never closed.

    NOTE: text that precedes the very first tag is not yielded; this keeps
    the output identical to the original script.
    """
    pos = 0
    prev_end = None  # offset just past the previous tag's '>', once known
    while True:
        # Search from an offset instead of slicing data[pos:], which copied
        # the remainder of the input on every iteration.  The patterns use
        # no anchors, so both forms find the same matches.
        opening = TAG_OPEN.search(data, pos)
        if opening is None:
            yield "Rest: [%s]" % data[pos:]
            return

        if prev_end is not None and opening.start() != prev_end:
            # Text sits between the previous tag and this one.
            yield data[prev_end:opening.start()]

        closing = TAG_CLOSE.search(data, opening.end())
        if closing is None:
            yield "No end tag"
            yield "Rest: [%s]" % data[opening.end():]
            return

        prev_end = closing.end()
        yield data[opening.start():prev_end]
        pos = prev_end


def main(path="Some.html"):
    """Read the file at *path* and print its tags and inter-tag text."""
    # 'with' guarantees the file is closed even if reading fails.
    with open(path, "r") as f_in:
        data = f_in.read()
    for piece in _iter_pieces(data):
        # Single-argument print() behaves the same on Python 2 and 3.
        print(piece)


if __name__ == "__main__":
    # Optional first argument overrides the default input file.
    main(sys.argv[1] if len(sys.argv) > 1 else "Some.html")