Difference between revisions of "Python - HTML Parsers"

From PeformIQ Upgrade
Jump to navigation Jump to search
(New page: =parsers= See BeautifulSoup ... ==Examples== <pre> #!/usr/bin/env python import re f_in = open("BO_1001_Body_01.txt", "r") data = f_in.read() s = re.compile('(<[^ >]*)') e = re.comp...)
 
 
Line 10: Line 10:
import re
import re


f_in = open("BO_1001_Body_01.txt", "r")
f_in = open("Some.html", "r")


data = f_in.read()
data = f_in.read()

Latest revision as of 18:35, 23 April 2009

parsers

See BeautifulSoup for a full-featured HTML parser. The example below is a minimal regex-based tag scanner.

Examples

#!/usr/bin/env python

import re

# Matches the opening of a tag: '<' followed by everything up to the first
# space or '>' (i.e. the tag name, possibly with a leading '/').
TAG_START = re.compile('(<[^ >]*)')
# Matches the '>' that terminates a tag.
TAG_CLOSE = re.compile('>')


def scan_markup(data):
    """Yield, in order, each chunk of output produced by scanning *data*.

    The scan walks *data* left to right looking for ``<...>`` tags and
    yields:

    * each complete tag, from its ``<`` through the next ``>``;
    * any non-empty text found between the end of one tag and the start
      of the next;
    * ``"No end tag"`` followed by ``"Rest: [...]"`` when a tag opening
      has no closing ``>`` (the remainder starts just after the matched
      tag opening);
    * ``"Rest: [...]"`` with the unscanned remainder once no further tag
      opening is found.

    NOTE: text that precedes the very first tag is not yielded; only text
    *between* tags is reported.  This mirrors the script's long-standing
    output.
    """
    idx = 0           # absolute position where the next search begins
    seen_tag = False  # True once at least one complete tag was emitted

    while True:
        # Search with a start position instead of slicing, so the
        # remainder of the buffer is not copied on every iteration.
        opening = TAG_START.search(data, idx)
        if opening is None:
            yield "Rest: [%s]" % data[idx:]
            return

        start = opening.start()
        name_end = opening.end()

        # idx sits at the end of the previous tag, so a gap before this
        # tag's '<' is text content between the two tags.
        if seen_tag and start != idx:
            yield "%s" % data[idx:start]

        closing = TAG_CLOSE.search(data, name_end)
        if closing is None:
            yield "No end tag"
            yield "Rest: [%s]" % data[name_end:]
            return

        tag_end = closing.end()
        yield data[start:tag_end]

        idx = tag_end
        seen_tag = True


def main(path="Some.html"):
    """Read *path* and print every chunk produced by scan_markup()."""
    # The context manager closes the file even if reading raises.
    with open(path, "r") as f_in:
        data = f_in.read()

    for chunk in scan_markup(data):
        # Single-argument print(...) behaves identically on Python 2 and 3.
        print(chunk)


if __name__ == "__main__":
    main()