Python - HTML Parsers

From PeformIQ Upgrade
Revision as of 17:35, 23 April 2009 by PeterHarding (talk | contribs)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigation Jump to search

parsers

See BeautifulSoup for a full-featured HTML parser; the example below is only a minimal regex-based tag splitter.

Examples

#!/usr/bin/env python

import re

# Start of a tag: '<' followed by the tag name (everything up to the first
# space or '>').
TAG_START = re.compile(r'<[^ >]*')
# The '>' that closes a tag.
TAG_END = re.compile('>')


def split_html(data):
    """Split *data* into tags and the text found between them.

    This is a naive regex scanner, not a real HTML parser: a tag is simply
    everything from a '<' up to and including the next '>'.

    Yields ``(kind, text)`` pairs in document order, where *kind* is:

    * ``"text"``         -- non-empty text found before a tag,
    * ``"tag"``          -- a complete tag, from '<' through '>',
    * ``"rest"``         -- whatever follows the last complete tag when no
                            further '<' exists (may be empty),
    * ``"unterminated"`` -- a '<' was found but no closing '>'; *text* is
                            what follows the tag name.

    ``"rest"`` and ``"unterminated"`` are always the final item yielded.
    """
    pos = 0  # everything before this offset has already been emitted

    while True:
        # Search with a start offset instead of slicing data[pos:], so the
        # remainder of the document is not copied on every iteration.
        opener = TAG_START.search(data, pos)

        if opener is None:
            yield ("rest", data[pos:])
            return

        # Text between the previous tag (or the start of the document) and
        # this tag.  Starting pos at 0 means text that precedes the very
        # first tag is reported too, rather than silently dropped.
        if opener.start() != pos:
            yield ("text", data[pos:opener.start()])

        closer = TAG_END.search(data, opener.end())

        if closer is None:
            yield ("unterminated", data[opener.end():])
            return

        pos = closer.end()
        yield ("tag", data[opener.start():pos])


def main(path="Some.html"):
    """Read the HTML file at *path* and print each tag / text run on a line."""
    # 'with' guarantees the file is closed even if read() raises.
    with open(path, "r") as f_in:
        data = f_in.read()

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    for kind, text in split_html(data):
        if kind == "unterminated":
            print("No end tag")
            print("Rest: [%s]" % text)
        elif kind == "rest":
            print("Rest: [%s]" % text)
        else:
            print("%s" % text)


if __name__ == "__main__":
    main()