Parsing WhitePages Search Results HTML
Revision as of 12:03, 13 December 2007 by PeterHarding (talk | contribs)
Searching WhitePages
The following Python script uses the httplib module to search against whitepages.com.au
Parsing the Search Results
The search results HTML looks as follows:
Hard On Tools
- 5 Scoresby Rd Bayswater 3153
- (03) 9720 5199
- <form class='mapForm' action='whereIs.do' method='post'><input type='hidden' name='streetNumber' value='5' /><input type='hidden' name='streetName' value='Scoresby' /><input type='hidden' name='streetType' value='Rd' /><input type='hidden' name='locality' value='Bayswater' /><input type='hidden' name='state' value='VIC' /><input type='hidden' name='placeName' value='Hard On Tools' /><input type='hidden' name='address' value='5 Scoresby Rd Bayswater 3153' /><input type='hidden' name='phoneNumber' value='(03) 9720 5199' /><input type='hidden' name='link' value='1197346075688' /><input type='hidden' name='hashCode' value='152335231501250' /><input type='hidden' name='brandId' value='5' /><input type='hidden' name='logData' value= /><input type='hidden' name='subscriberName' value='Hard On Tools' /><input name='mapSubmit' src='images/result-map.gif' alt='Map' type='image' /></form>
OR...
- (03) 9738 2882
Fax
- (03) 9720 0966
Hard Parts Australia Pty Ltd
- 14 Yiannis Crt Springvale 3171
- 0418 756 340
- <form class='mapForm' action='whereIs.do' method='post'><input type='hidden' name='streetNumber' value='14' /><input type='hidden' name='streetName' value='Yiannis' /><input type='hidden' name='streetType' value='Crt' /><input type='hidden' name='locality' value='Springvale' /><input type='hidden' name='state' value='VIC' /><input type='hidden' name='placeName' value='Hard Parts Australia Pty Ltd' /><input type='hidden' name='address' value='14 Yiannis Crt Springvale 3171' /><input type='hidden' name='phoneNumber' value='0418 756 340' /><input type='hidden' name='link' value='1197346075688' /><input type='hidden' name='hashCode' value='152335231501250' /><input type='hidden' name='brandId' value='5' /><input type='hidden' name='logData' value= /><input type='hidden' name='subscriberName' value='Hard Parts Australia Pty Ltd' /><input name='mapSubmit' src='images/result-map.gif' alt='Map' type='image' /></form>
Hard Parts Victoria
The following script uses the BeautifulSoup package (see https://www.crummy.com/software/BeautifulSoup/) to parse a saved copy of the search results page:
#!/usr/bin/env python
"""Print the fields of every WhitePages search result in a saved HTML page.

The page is read from ``log/0002.html``.  Each result is a
``<div class="encap_result">``; for every such div that contains text the
script prints a separator line followed by:

* the first text node of each ``<span class="black">``,
* the ``value`` of the hidden map-form inputs (placeName, address,
  locality, streetNumber, streetName, streetType),
* the first text node of each ``<li>`` without a class, and of each
  ``<li>`` with class "entryData address" / "entryData phone".

Raises IOError if the page cannot be opened and KeyError if one of the
hidden inputs has no ``value`` attribute.

``print(...)`` is always called with a single string argument, so the
output is the same under Python 2 and the file also parses under Python 3.
"""

import pprint  # not used below; kept for ad-hoc debugging of tag internals

from BeautifulSoup import BeautifulSoup

RESULTS_PAGE = 'log/0002.html'

# Hidden <input name=...> fields of the per-result map form, in print order.
FORM_FIELDS = (
    'placeName',
    'address',
    'locality',
    'streetNumber',
    'streetName',
    'streetType',
)

# (label, class attribute) pairs for the <li> elements that are reported.
# A class of None selects <li> elements that carry no class attribute.
LIST_ITEMS = (
    ('li', None),
    ('addr', 'entryData address'),
    ('phone', 'entryData phone'),
)

# The soup is built inside the with-block so the file handle is closed as
# soon as the markup has been parsed (the original never closed it).
with open(RESULTS_PAGE, 'r') as doc:
    soup = BeautifulSoup(doc)

for result in soup.findAll('div', {'class': 'encap_result'}):
    # NOTE(review): the published listing had lost its indentation; the whole
    # per-result report is assumed to be guarded by this text check - confirm.
    if not result.find(text=True):
        continue

    print("===========================================")

    for span in result.findAll('span', {'class': 'black'}):
        print('span="black" -> "%s"' % span.find(text=True))

    # One loop over the field names replaces six copy-pasted loops; the
    # original also ran the placeName loop twice, printing each name twice.
    for field in FORM_FIELDS:
        for tag in result.findAll('input', {'name': field}):
            # Item access is the documented way to read an attribute; the
            # original read the undocumented ``attrMap`` attribute.
            print('%s -> "%s"' % (field, tag['value']))

    for label, css_class in LIST_ITEMS:
        for item in result.findAll('li', {'class': css_class}):
            print('%s -> "%s"' % (label, item.find(text=True)))