Parsing WhitePages Search Results HTML

From PeformIQ Upgrade
Revision as of 12:00, 13 December 2007 by PeterHarding (talk | contribs) (New page: = Example Script = This script makes use of the BeautifulSoup Package (see [xxx]); #!/usr/bin/env python import pprint from BeautifulSoup import BeautifulSoup doc = open('log/0...)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigation Jump to search

Example Script

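This script uses the BeautifulSoup package, version 3 (see https://www.crummy.com/software/BeautifulSoup/), to extract the name, address and phone fields from a saved WhitePages search-results page: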

#!/usr/bin/env python

import pprint

from BeautifulSoup import BeautifulSoup

# Extract and print the interesting fields from every search result found in a
# saved WhitePages results page.  Each result lives in a
# <div class="encap_result"> element.

# Names of the <input> elements whose "value" attribute is reported for each
# result, in the order they are printed.
INPUT_FIELDS = (
    'placeName',
    'address',
    'locality',
    'streetNumber',
    'streetName',
    'streetType',
)

# (output label, class attribute) pairs for the <li> elements reported for
# each result.  In BeautifulSoup 3 an attribute value of None matches tags
# that do not carry that attribute at all.
LIST_ITEMS = (
    ('li', None),
    ('addr', 'entryData address'),
    ('phone', 'entryData phone'),
)

# BeautifulSoup reads the file object while building the soup, so the handle
# is only needed for the duration of the constructor call; the `with` block
# guarantees it is closed afterwards (the original never closed it).
with open('log/0002.html', 'r') as doc:
    soup = BeautifulSoup(doc)

for obj in soup.findAll('div', {'class': 'encap_result'}):
    # Skip result containers that hold no text node at all.
    if not obj.find(text=True):
        continue

    print("===========================================")

    for span in obj.findAll('span', {'class': 'black'}):
        print('span="black" -> "%s"' % span.find(text=True))

    # Each field is reported once per matching <input>.  (The original
    # script looped over 'placeName' twice and so printed it twice.)
    for field in INPUT_FIELDS:
        for tag in obj.findAll('input', {'name': field}):
            # Use the public Tag.get() accessor rather than the internal
            # attrMap cache; an <input> without a value prints as "".
            print('%s -> "%s"' % (field, tag.get('value', '')))

    for label, css_class in LIST_ITEMS:
        for item in obj.findAll('li', {'class': css_class}):
            print('%s -> "%s"' % (label, item.find(text=True)))