Parsing WhitePages Search Results HTML
Searching WhitePages
The following Python script uses httplib to run a search against whitepages.com.au (via a proxy).
#!/usr/bin/env python
#
#
#-------------------------------------------------------------------------------
import re
import sys
import base64
import pprint
import urllib
import httplib
from copy import copy
#-------------------------------------------------------------------------------
# "host:port" that httplib connects to; every request goes via this proxy.
PROXY = 'PROXY:8080'
# Host name of the site being searched (also sent as the Host header).
SITE = 'www.whitepages.com.au'
# Shared httplib connection; do() creates it and request() uses it.
connection = None
#===== Headers =================================================================
# 'Accept' : 'text/plain, text/html',
# Headers sent with the GET requests.
# NOTE(review): 'Accept-Encoding' invites a gzip/deflate reply, but nothing in
# this script decompresses response bodies - confirm the logged GET bodies are
# readable.
get_headers = {
    'Accept-Encoding' : 'gzip, deflate',
    'Accept' : '*/*',
    'Accept-Language' : 'en-au',
    'Host' : SITE,
    'Connection' : 'Keep-Alive',
    # The closing ')' was missing from the original value.
    'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)'
}
# Base headers for the form POSTs; do() adds Content-Length and Cookie.
post_headers = {
    'Content-type' : 'application/x-www-form-urlencoded',
    'Accept' : 'text/plain'
}
# Sequence number of the next request; request() uses it to name the log files.
idx = 0
#===== Logging =================================================================
def log_req_header(idx, hdr):
    """Write the pretty-printed request headers to 'log/NNNN.req'.

    idx -- request sequence number, zero-padded to four digits in the file name
    hdr -- the headers object to record (formatted with pprint.pformat)

    The 'log' directory must already exist; open() does not create it.
    """
    # 'with' closes the file even if the write raises.
    with open('log/%04d.req' % idx, 'w') as of:
        of.write("%s\n" % pprint.pformat(hdr))
#-------------------------------------------------------------------------------
def log_resp_header(idx, resp):
    """Dump a response's status, reason and header details to 'log/NNNN.hdr'.

    idx  -- request sequence number, zero-padded to four digits in the file name
    resp -- response object; its 'status', 'reason' and 'msg' attributes are
            read, and both resp.__dict__ and resp.msg.__dict__ are pretty-printed

    The 'log' directory must already exist; open() does not create it.
    """
    # 'with' closes the file even if one of the writes raises.
    with open('log/%04d.hdr' % idx, 'w') as of:
        of.write("resp.__dict__ ->\n%s\n\n" % pprint.pformat(resp.__dict__))
        of.write("Status %s Reason [%s]\n" % (resp.status, resp.reason))
        of.write("Msg ->\n%s\n\n" % resp.msg)
        of.write("Msg.__dict__ ->\n%s\n\n" % pprint.pformat(resp.msg.__dict__))
#-------------------------------------------------------------------------------
def log_resp_body(idx, resp_body):
    """Write the response body, exactly as given, to 'log/NNNN.bdy'.

    idx       -- request sequence number, zero-padded to four digits in the file name
    resp_body -- the body text to store

    The 'log' directory must already exist; open() does not create it.
    """
    # 'with' closes the file even if the write raises.
    with open('log/%04d.bdy' % idx, 'w') as of:
        of.write(resp_body)
#===== Encapsulate the request code ============================================
def request(method, url, params, headers):
    """Send one HTTP request over the shared connection and log the exchange.

    The request headers, the response headers and the response body are
    written to the log/NNNN.req, .hdr and .bdy files numbered by the global
    counter 'idx'; the body is also echoed to stdout.  The counter is then
    advanced by one.

    Returns the response object obtained from the connection (its body has
    already been read).
    """
    global idx
    print('>>>> %s %s <<<<' % (method, url))
    connection.request(method, url, params, headers)
    response = connection.getresponse()
    seq = idx
    log_req_header(seq, headers)
    log_resp_header(seq, response)
    body = response.read()
    log_resp_body(seq, body)
    print(body)
    idx = seq + 1
    return response
#===============================================================================
def do(pattern='Hard', state='VIC'):
    """Run one business-name search and fetch the first two pages of results.

    pattern -- text submitted as the 'subscriberName' form field
    state   -- text submitted as the 'state' form field

    Steps: GET the site root, GET /wp/index.jsp and take the JSESSIONID from
    its Set-Cookie header, POST the search form, then POST again with 'page'
    set to '2'.  Every exchange is logged by request().

    Raises RuntimeError when the response carries no JSESSIONID cookie.
    """
    global connection
    connection = httplib.HTTPConnection(PROXY)
    BASE_URL = 'http://%s' % SITE
    try:
        #------------------------------------------------------------------------
        request('GET', BASE_URL + '/', None, copy(get_headers))
        #------------------------------------------------------------------------
        URL = BASE_URL + '/wp/index.jsp'
        resp = request('GET', URL, None, copy(get_headers))
        # getheader() is the public accessor and copes with a missing header.
        cookie = resp.getheader('set-cookie', '')
        # [^;]+ stops at the first ';' so cookie attributes are not captured.
        m = re.search(r'JSESSIONID=([^;]+)', cookie)
        if not m:
            raise RuntimeError('No JSESSIONID cookie returned by %s' % URL)
        JSESSIONID = m.group(1)
        print(JSESSIONID)
        #---------------------------------------------------------------------
        DO = 'POST'
        URL = BASE_URL + '/wp/busSearch.do;jsessionid=%s' % JSESSIONID
        headers = copy(post_headers)
        headers['Cookie'] = 'JSESSIONID=%s' % JSESSIONID
        form_data = {
            'subscriberName' : pattern,
            'state' : state,
            'suburb' : '',
            'street' : '',
            'Search' : 'Search'
        }
        params = urllib.urlencode(form_data)
        headers['Content-Length'] = len(params)
        request(DO, URL, params, headers)
        #---------------------------------------------------------------------
        URL = BASE_URL + '/wp/busSearch.do'
        form_data = {
            'subscriberName' : pattern,
            'state' : state,
            'page' : '2'
        }
        params = urllib.urlencode(form_data)
        headers['Content-Length'] = len(params)
        request(DO, URL, params, headers)
    finally:
        # Release the socket even when a request fails part-way through.
        connection.close()
#===============================================================================
# Script entry point: run the search as soon as the file is executed.
do()
#-------------------------------------------------------------------------------
This script writes the search results into files (page 1 => 'log/0002.bdy' and page 2 => 'log/0003.bdy'). Amend the above code to handle more pages of search results.
Parsing the Search Results
The search results HTML looks as follows:
<div class="encap_result" id="result-10"><ul><li id='res10-ln0'><h4><span class='blackboldcaps'>Hard On Tools</span></h4></li><li class='subMultiContainer' id='res10-ln1'><ul><li class='entryData address'>5 Scoresby Rd Bayswater 3153</li><li class='entryData phoneNumber'><span class='blackboldcaps'>(03) 9720 5199</span></li><li class='entryData whereIsMap'><form class='mapForm' action='whereIs.do' method='post'><input type='hidden' name='streetNumber' value='5' /><input type='hidden' name='streetName' value='Scoresby' /><input type='hidden' name='streetType' value='Rd' /><input type='hidden' name='locality' value='Bayswater' /><input type='hidden' name='state' value='VIC' /><input type='hidden' name='placeName' value='Hard On Tools' /><input type='hidden' name='address' value='5 Scoresby Rd Bayswater 3153' /><input type='hidden' name='phoneNumber' value='(03) 9720 5199' /><input type='hidden' name='link' value='1197346075688' /><input type='hidden' name='hashCode' value='152335231501250' /><input type='hidden' name='brandId' value='5' /><input type='hidden' name='logData' value='' /><input type='hidden' name='subscriberName' value='Hard On Tools' /><input name='mapSubmit' src='images/result-map.gif' alt='Map' type='image' /></form></li></ul></li><li class='subMultiContainer' id='res10-ln2'><ul><li class='entryData address'><h5><span class='black'>OR...</span></h5></li><li class='entryData phoneNumber'>(03) 9738 2882</li></ul></li><li class='subMultiContainer' id='res10-ln3'><ul><li class='entryData address indent2'><h5><span class='black'>Fax</span></h5></li><li class='entryData phoneNumber'>(03) 9720 0966</li></ul></li></ul><div class="clearMe"> </div></div><div class="encap_result" id="result-11"><ul><li><h4><span class='black'>Hard Parts Victoria</span></h4></li>
=Results of Search= The following ZIP file contains a selection of company details extracted from Whitepages...
[http://www.performiq.com.au/kb/images/WhitepageSearchResults.zip WhitepageSearchResults.zip] [[Category:Python]][[Category:Internet]]
The following script, which makes use of the BeautifulSoup package, is a first cut at parsing out the useful data from the WhitePages results HTML (see here for the BeautifulSoup home page):
#!/usr/bin/env python
import pprint
from BeautifulSoup import BeautifulSoup
# Read a saved results page and print the interesting fields of every result.
# NOTE(review): the search script logs bodies as 'log/NNNN.bdy'; confirm that
# 'log/0002.html' is a copy of one of those files.
with open('log/0002.html', 'r') as doc:
    # BeautifulSoup reads the whole file here, so it can be closed afterwards.
    soup = BeautifulSoup(doc)
# print len(soup('table', { "class" : "table_style"}))
# tables = soup.findAll('table', { "class" : "table_style"})
# Each search result sits in its own <div class="encap_result">.
objs = soup.findAll('div', { "class" : "encap_result"})
# Pretty-printer kept for ad-hoc debugging, e.g. pp.pprint(tag.__dict__).
pp = pprint.PrettyPrinter(3)
# Hidden <input> fields of the map form, in the order they are printed.
INPUT_FIELDS = ('placeName', 'address', 'locality',
                'streetNumber', 'streetName', 'streetType')
for obj in objs:
    # Skip result blocks that contain no text at all.
    if not obj.find(text=True):
        continue
    print("===========================================")
    for span in obj.findAll('span', { 'class' : 'black'}):
        print('span="black" -> "%s"' % span.find(text=True))
    for field in INPUT_FIELDS:
        for tag in obj.findAll('input', { "name" : field}):
            print('%s -> "%s"' % (field, tag['value']))
    for li in obj.findAll('li', { "class" : None}):
        print('li -> "%s"' % li.find(text=True))
    for address in obj.findAll('li', { "class" : "entryData address"}):
        print('addr -> "%s"' % address.find(text=True))
    # The results markup uses class 'entryData phoneNumber'; the previous
    # selector 'entryData phone' matched nothing.
    for phone in obj.findAll('li', { "class" : "entryData phoneNumber"}):
        print('phone -> "%s"' % phone.find(text=True))
A more Complex Script
This script loops through all the available pages in the search results and parses them using the BeautifulSoup HTML parsing module.
wp_get.py
#!/usr/bin/env python
#
#
#-------------------------------------------------------------------------------
import re
import sys
import base64
import pprint
import urllib
import httplib
from copy import copy
#-------------------------------------------------------------------------------
from wp_parser import parse
#-------------------------------------------------------------------------------
# "host:port" that httplib connects to; every request goes via this proxy.
PROXY = 'PROXY:8080'
# Host name of the site being searched (also sent as the Host header).
SITE = 'www.whitepages.com.au'
# Shared httplib connection; do() creates it and request() uses it.
connection = None
# Compiled regex for the numbered page links; process() sets it, do() uses it.
next_page_pat = None
# XML text accumulated from every page parsed so far.
results = ''
#-------------------------------------------------------------------------------
# 'Accept' : 'text/plain, text/html',
# Headers sent with the GET requests.
# NOTE(review): 'Accept-Encoding' invites a gzip/deflate reply, but nothing in
# this script decompresses response bodies - confirm the logged GET bodies are
# readable.
get_headers = {
    'Accept-Encoding' : 'gzip, deflate',
    'Accept' : '*/*',
    'Accept-Language' : 'en-au',
    'Host' : SITE,
    'Connection' : 'Keep-Alive',
    # The closing ')' was missing from the original value.
    'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)'
}
# Base headers for the form POSTs; do() adds Content-Length and Cookie.
post_headers = {
    'Content-type' : 'application/x-www-form-urlencoded',
    'Accept' : 'text/plain'
}
# Sequence number of the next request; request() uses it to name the log files.
idx = 0
#-------------------------------------------------------------------------------
def log_req_header(idx, hdr):
    """Record the request headers, pretty-printed, in 'log/NNNN.req'.

    idx -- request sequence number (zero-padded to four digits in the name)
    hdr -- headers object to record; it is formatted with pprint.pformat

    The 'log' directory must already exist; open() does not create it.
    """
    # The context manager closes the file even when the write fails.
    with open('log/%04d.req' % idx, 'w') as of:
        of.write("%s\n" % pprint.pformat(hdr))
#-------------------------------------------------------------------------------
def log_resp_header(idx, resp):
    """Record a response's status line and header details in 'log/NNNN.hdr'.

    idx  -- request sequence number (zero-padded to four digits in the name)
    resp -- response object; 'status', 'reason' and 'msg' are read from it,
            and resp.__dict__ and resp.msg.__dict__ are pretty-printed

    The 'log' directory must already exist; open() does not create it.
    """
    # The context manager closes the file even when a write fails.
    with open('log/%04d.hdr' % idx, 'w') as of:
        of.write("resp.__dict__ ->\n%s\n\n" % pprint.pformat(resp.__dict__))
        of.write("Status %s Reason [%s]\n" % (resp.status, resp.reason))
        of.write("Msg ->\n%s\n\n" % resp.msg)
        of.write("Msg.__dict__ ->\n%s\n\n" % pprint.pformat(resp.msg.__dict__))
#-------------------------------------------------------------------------------
def log_resp_body(idx, resp_body):
    """Store the response body, unmodified, in 'log/NNNN.bdy'.

    idx       -- request sequence number (zero-padded to four digits in the name)
    resp_body -- the body text to store

    do() later passes this file's name to parse().  The 'log' directory must
    already exist; open() does not create it.
    """
    # The context manager closes the file even when the write fails.
    with open('log/%04d.bdy' % idx, 'w') as of:
        of.write(resp_body)
#-------------------------------------------------------------------------------
def do(pattern, state):
    """Search for 'pattern' in 'state', parse every result page and save the XML.

    pattern -- text submitted as the 'subscriberName' form field
    state   -- text submitted as the 'state' form field

    Steps: GET the site root, GET /wp/index.jsp and take the JSESSIONID from
    its Set-Cookie header, POST the search form, then POST once per further
    page.  Each logged body is run through parse(); the XML produced by this
    call is added to the global 'results' and appended to 'results.xml'.

    The global 'next_page_pat' must already be compiled (process() does so).

    Raises RuntimeError when the response carries no JSESSIONID cookie.
    """
    global connection
    global results
    print("Pattern ==> '%s'" % pattern)
    connection = httplib.HTTPConnection(PROXY)
    BASE_URL = 'http://%s' % SITE
    # XML produced by this call only; kept apart from the cumulative global.
    found = []
    try:
        #------------------------------------------------------------------------
        request('GET', BASE_URL + '/', None, copy(get_headers))
        #------------------------------------------------------------------------
        URL = BASE_URL + '/wp/index.jsp'
        r = request('GET', URL, None, copy(get_headers))
        # getheader() is the public accessor and copes with a missing header.
        cookie = r['response_header'].getheader('set-cookie', '')
        # [^;]+ stops at the first ';' so cookie attributes are not captured.
        m = re.search(r'JSESSIONID=([^;]+)', cookie)
        if not m:
            raise RuntimeError('No JSESSIONID cookie returned by %s' % URL)
        JSESSIONID = m.group(1)
        print(JSESSIONID)
        #---------------------------------------------------------------------
        DO = 'POST'
        URL = BASE_URL + '/wp/busSearch.do;jsessionid=%s' % JSESSIONID
        headers = copy(post_headers)
        headers['Cookie'] = 'JSESSIONID=%s' % JSESSIONID
        form_data = {
            'subscriberName' : pattern,
            'state' : state,
            'suburb' : '',
            'street' : '',
            'Search' : 'Search'
        }
        params = urllib.urlencode(form_data)
        headers['Content-Length'] = len(params)
        r = request(DO, URL, params, headers)
        found.append(parse('log/%04d.bdy' % r['idx']))
        # One match per numbered page link found in the first results page.
        links = next_page_pat.findall(r['response_body'])
        for link in links:
            print(link)
        # NOTE(review): further pages are requested as page=1 .. len(links)-1,
        # unchanged from the original; verify this numbering against the
        # printed links (an off-by-one here would repeat or miss a page).
        cnt = max(len(links) - 1, 0)
        #---------------------------------------------------------------------
        URL = BASE_URL + '/wp/busSearch.do'
        for i in range(cnt):
            form_data = {
                'subscriberName' : pattern,
                'state' : state,
                'page' : i + 1
            }
            params = urllib.urlencode(form_data)
            headers['Content-Length'] = len(params)
            r = request(DO, URL, params, headers)
            found.append(parse('log/%04d.bdy' % r['idx']))
    finally:
        # Release the socket even when a request fails part-way through.
        connection.close()
    xml = ''.join(found)
    results += xml
    # Append only this call's XML: 'results' is cumulative, so writing it on
    # every call repeated the entries of all earlier patterns in the file.
    with open('results.xml', 'a+') as ofh:
        ofh.write(xml)
#-------------------------------------------------------------------------------
def request(method, url, params, headers):
    """Send one HTTP request over the shared connection and log the exchange.

    The request headers, the response headers and the response body are
    written to the log/NNNN.req, .hdr and .bdy files numbered by the global
    counter 'idx', which is then advanced by one.

    Returns a dict with the keys 'idx' (the number used for the log files),
    'request_header', 'response_header' (the response object) and
    'response_body' (the body that was read).
    """
    global idx
    print('>>>> %s %s <<<<' % (method, url))
    connection.request(method, url, params, headers)
    response = connection.getresponse()
    seq = idx
    log_req_header(seq, headers)
    log_resp_header(seq, response)
    body = response.read()
    log_resp_body(seq, body)
    # print resp_body
    idx = seq + 1
    return {
        'idx' : seq,
        'request_header' : headers,
        'response_header' : response,
        'response_body' : body,
    }
#-------------------------------------------------------------------------------
def process(search_patterns):
    """Run do() once for every entry of 'search_patterns', always for 'VIC'.

    Before searching, the global 'next_page_pat' is set to the compiled regex
    that do() uses to pick the numbered page links out of a results page.
    """
    global next_page_pat
    next_page_pat = re.compile(r';(page=[0-9]*">[0-9]*<\/a>)')
    state = 'VIC'
    for name in search_patterns:
        do(name, state)
#-------------------------------------------------------------------------------
def searches():
    """Return the 20 two-letter search prefixes 'Wa', 'We', ... 'Zu'.

    Each of the capital letters W, X, Y and Z is paired, in order, with each
    of the lower-case vowels a, e, i, o, u.
    """
    # A comprehension replaces the nested append loops and avoids a local
    # variable that shadowed the function's own name.
    return [ch + vowel for ch in 'WXYZ' for vowel in 'aeiou']
#-------------------------------------------------------------------------------
def used():
    """Return the list of patterns recorded here (currently just 'Zeus').

    The original built this list and then discarded it; returning it makes
    the function usable.
    """
    # NOTE(review): presumably a record of search patterns already run - confirm.
    return ['Zeus']
#-------------------------------------------------------------------------------
def main():
    """Run the search for a fixed list of name patterns via process()."""
    process(['Zany', 'Zan', 'Zen', 'Zend'])
#-------------------------------------------------------------------------------
# Script entry point: start the searches as soon as the file is executed.
main()
#-------------------------------------------------------------------------------
wp_parser.py
#!/usr/bin/env python
import pprint
from BeautifulSoup import BeautifulSoup
#-------------------------------------------------------------------------------------
def parse(fname):
    """Extract the search results from a saved results page as XML-like text.

    fname -- path of the HTML file to read

    Every <div class="encap_result"> that contains text yields one
    <entry>...</entry> block holding, in this order: the text of each
    <span class="black"> (<tag>), the values of the hidden form inputs
    (placeName, subscriberName, address, streetNumber, streetName,
    streetType, locality), the text of each class-less <li> (<li>), each
    address item (<addr>) and each phone item (<phoneNumber>).

    Values are inserted as BeautifulSoup returns them; no XML escaping is
    applied.  Returns '' when the page holds no results.
    """
    # BeautifulSoup reads the whole file in its constructor, so the handle
    # can be closed straight afterwards.
    with open(fname, 'r') as doc:
        soup = BeautifulSoup(doc)
    # Hidden <input> names, in output order; each doubles as the element name.
    input_fields = ('placeName', 'subscriberName', 'address',
                    'streetNumber', 'streetName', 'streetType', 'locality')
    # Pieces are collected and joined once instead of growing a string.
    parts = []
    for obj in soup.findAll('div', { "class" : "encap_result"}):
        # Skip result blocks that contain no text at all.
        if not obj.find(text=True):
            continue
        parts.append('<entry>\n')
        for span in obj.findAll('span', { 'class' : 'black'}):
            parts.append(' <tag>%s</tag>\n' % span.find(text=True))
        for field in input_fields:
            for tag in obj.findAll('input', { "name" : field}):
                parts.append(' <%s>%s</%s>\n' % (field, tag['value'], field))
        for li in obj.findAll('li', { "class" : None}):
            parts.append(' <li>%s</li>\n' % li.find(text=True))
        for address in obj.findAll('li', { "class" : "entryData address"}):
            parts.append(' <addr>%s</addr>\n' % address.find(text=True))
        for phone in obj.findAll('li', { "class" : "entryData phoneNumber"}):
            parts.append(' <phoneNumber>%s</phoneNumber>\n' % phone.find(text=True))
        parts.append('</entry>\n\n')
    return ''.join(parts)
#-------------------------------------------------------------------------------------
def test():
    """Placeholder for a manual check of parse(); currently does nothing.

    The docstring also gives the function a body: with only comments inside,
    the definition was a syntax error.
    """
    # xml = parse('html/0002.html')
    # print xml
#-------------------------------------------------------------------------------------
"""
<div class="encap_result" id="result-10"><ul><li id='res10-ln0'><h4><span class='blackboldcaps'>Hard On Tools</span></h4></li><li class='subMultiContainer' id='res10-ln1'><ul><li class='entryData address'>5 Scoresby Rd Bayswater 3153</li><li class='entryData phoneNumber'><span class='blackboldcaps'>(03) 9720 5199</span></li><li class='entryData whereIsMap'><form class='mapForm' action='whereIs.do' method='post'><input type='hidden' name='streetNumber' value='5' /><input type='hidden' name='streetName' value='Scoresby' /><input type='hidden' name='streetType' value='Rd' /><input type='hidden' name='locality' value='Bayswater' /><input type='hidden' name='state' value='VIC' /><input type='hidden' name='placeName' value='Hard On Tools' /><input type='hidden' name='address' value='5 Scoresby Rd Bayswater 3153' /><input type='hidden' name='phoneNumber' value='(03) 9720 5199' /><input type='hidden' name='link' value='1197346075688' /><input type='hidden' name='hashCode' value='152335231501250' /><input type='hidden' name='brandId' value='5' /><input type='hidden' name='logData' value='' /><input type='hidden' name='subscriberName' value='Hard On Tools' /><input name='mapSubmit' src='images/result-map.gif' alt='Map' type='image' /></form></li></ul></li><li class='subMultiContainer' id='res10-ln2'><ul><li class='entryData address'><h5><span class='black'>OR...</span></h5></li><li class='entryData phoneNumber'>(03) 9738 2882</li></ul></li><li class='subMultiContainer' id='res10-ln3'><ul><li class='entryData address indent2'><h5><span class='black'>Fax</span></h5></li><li class='entryData phoneNumber'>(03) 9720 0966</li></ul></li></ul><div class="clearMe"> </div></div><div class="encap_result" id="result-11"><ul><li><h4><span class='black'>Hard Parts Australia Pty Ltd</span></h4></li>
<li class='entryData address'>14 Yiannis Crt Springvale 3171</li><li class='entryData phoneNumber'>0418 756 340</li><li class='entryData whereIsMap'><form class='mapForm' action='whereIs.do' method='post'><input type='hidden' name='streetNumber' value='14' /><input type='hidden' name='streetName' value='Yiannis' /><input type='hidden' name='streetType' value='Crt' /><input type='hidden' name='locality' value='Springvale' /><input type='hidden' name='state' value='VIC' /><input type='hidden' name='placeName' value='Hard Parts Australia Pty Ltd' /><input type='hidden' name='address' value='14 Yiannis Crt Springvale 3171' /><input type='hidden' name='phoneNumber' value='0418 756 340' /><input type='hidden' name='link' value='1197346075688' /><input type='hidden' name='hashCode' value='152335231501250' /><input type='hidden' name='brandId' value='5' /><input type='hidden' name='logData' value='' /><input type='hidden' name='subscriberName' value='Hard Parts Australia Pty Ltd' /><input name='mapSubmit' src='images/result-map.gif' alt='Map' type='image' /></form></li></ul><div class="clearMe"> </div></div><div class="encap_result" id="result-12"><ul><li><h4><span class='black'>Hard Parts Victoria</span></h4></li>
"""
Results of Search
The following ZIP file contains a selection of company details extracted from Whitepages...
WhitepageSearchResults.zip (http://www.performiq.com.au/kb/images/WhitepageSearchResults.zip)