Parsing WhitePages Search Results HTML
Revision as of 12:00, 13 December 2007 by PeterHarding (talk | contribs) (New page: = Example Script = This script makes use of the BeautifulSoup Package (see [xxx]); #!/usr/bin/env python import pprint from BeautifulSoup import BeautifulSoup doc = open('log/0...)
Example Script
This script uses the BeautifulSoup package (see https://www.crummy.com/software/BeautifulSoup/) to parse a saved WhitePages search results page:
#!/usr/bin/env python
import pprint
from BeautifulSoup import BeautifulSoup
# Parse a saved search-results page and print selected fields of every
# result block (<div class="encap_result">) to stdout.
#
# NOTE(review): the published listing had lost its indentation; the nesting
# below (everything for one result sits under the "has text" check) is the
# most plausible reconstruction -- confirm against the original script.

# <input> elements whose "value" attribute is reported, in output order.
# Each field is listed (and therefore printed) exactly once.
INPUT_FIELDS = (
    'placeName',
    'address',
    'locality',
    'streetNumber',
    'streetName',
    'streetType',
)

# The document is parsed inside the "with" block so the file handle is
# closed as soon as the tree has been built instead of leaking until exit.
with open('log/0002.html', 'r') as doc:
    soup = BeautifulSoup(doc)

objs = soup.findAll('div', {"class": "encap_result"})

# print(...) with a single argument behaves identically as a Python 2
# print statement, so the output format is unchanged.
for obj in objs:
    # Skip result blocks that contain no text node at all.
    if not obj.find(text=True):
        continue

    print("===========================================")

    for span in obj.findAll('span', {'class': 'black'}):
        print('span="black" -> "%s"' % span.find(text=True))

    for field in INPUT_FIELDS:
        for tag in obj.findAll('input', {"name": field}):
            # tag['value'] is the documented attribute accessor; it raises
            # KeyError for an <input> without a value, just as the previous
            # lookup through the internal attrMap dict did.
            print('%s -> "%s"' % (field, tag['value']))

    # A None attribute value matches <li> tags that have no "class"
    # attribute (see the BeautifulSoup 3 documentation for findAll).
    for li in obj.findAll('li', {"class": None}):
        print('li -> "%s"' % li.find(text=True))

    for address in obj.findAll('li', {"class": "entryData address"}):
        print('addr -> "%s"' % address.find(text=True))

    for phone in obj.findAll('li', {"class": "entryData phone"}):
        print('phone -> "%s"' % phone.find(text=True))