Python - HTML Parsers
Jump to navigation
Jump to search
parsers
See BeautifulSoup ...
Examples
#!/usr/bin/env python
import re
# Naive HTML splitter: walks the document and prints every tag and every run
# of text between tags on its own line.  This is a regex-based illustration,
# not a real HTML parser (a ">" inside a comment or an attribute value ends
# the tag early).

# Start of a tag: "<" followed by the tag name (anything up to a space or ">").
TAG_OPEN = re.compile('(<[^ >]*)')
# The ">" that closes a tag.
TAG_CLOSE = re.compile('>')


def split_markup(data):
    """Yield the output lines for *data*, in document order.

    Each yielded string is one line of output:

    * a run of text found between two tags (or before the first tag),
    * a complete tag, from its "<" up to and including the closing ">",
    * ``"No end tag"`` followed by ``"Rest: [...]"`` when a tag is opened
      but never closed; the rest is everything after the tag name,
    * ``"Rest: [...]"`` with whatever follows the last tag (possibly empty)
      once no further tag is found.

    The generator stops after a ``"Rest: [...]"`` line.
    """
    pos = 0  # everything before this offset has already been emitted
    while True:
        # search(data, pos) scans in place; slicing data[pos:] would copy the
        # remainder of the document on every iteration.
        opening = TAG_OPEN.search(data, pos)
        if opening is None:
            yield "Rest: [%s]" % data[pos:]
            return

        if opening.start() != pos:
            # Text between the previous tag (or the start of the document)
            # and this tag.
            yield data[pos:opening.start()]

        closing = TAG_CLOSE.search(data, opening.end())
        if closing is None:
            yield "No end tag"
            yield "Rest: [%s]" % data[opening.end():]
            return

        pos = closing.end()
        yield data[opening.start():pos]


def main(path="Some.html"):
    """Read the HTML file at *path* and print its tags and text runs."""
    # The context manager closes the file even if reading fails.
    with open(path, "r") as f_in:
        data = f_in.read()
    for line in split_markup(data):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(line)


if __name__ == "__main__":
    main()