Difference between revisions of "Parsing WhitePages Search Results HTML"
Jump to navigation
Jump to search
PeterHarding (talk | contribs) |
PeterHarding (talk | contribs) |
||
Line 2: | Line 2: | ||
The following Python script uses HTTPLib to search against whitepages.com.au | The following Python script uses HTTPLib to search against whitepages.com.au | ||
#!/usr/bin/env python | |||
# | |||
# | |||
#------------------------------------------------------------------------------- | |||
import re | |||
import sys | |||
import base64 | |||
import pprint | |||
import urllib | |||
import httplib | |||
from copy import copy | |||
#------------------------------------------------------------------------------- | |||
# Address the HTTP connection is opened against, as 'host:port'.
PROXY = 'PROXY:8080'
# Site being searched; also sent as the Host header of the GET requests.
SITE = 'www.whitepages.com.au'
# Shared httplib connection; assigned by do() and used by request().
connection = None
#-------------------------------------------------------------------------------
# Browser-like headers sent with the GET requests.
# A narrower alternative for 'Accept' is 'text/plain, text/html'.
# NOTE(review): gzip/deflate is advertised here, but nothing visible in this
# script decompresses the response body -- confirm the logged bodies are usable.
get_headers = {
    'Accept-Encoding': 'gzip, deflate',
    'Accept': '*/*',
    'Accept-Language': 'en-au',
    'Host': SITE,
    'Connection': 'Keep-Alive',
    # The closing parenthesis was missing from the original string.
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; '
                  '.NET CLR 1.1.4322; .NET CLR 2.0.50727)',
}
# Headers sent with the url-encoded form POSTs.
post_headers = {
    'Content-type': 'application/x-www-form-urlencoded',
    'Accept': 'text/plain',
}
# Sequence number of the next request; request() uses it to name the log files.
idx = 0
#------------------------------------------------------------------------------- | |||
def log_req_header(idx, hdr):
    """Write the pretty-printed request headers to log/NNNN.req.

    idx -- request sequence number, zero-padded to four digits in the name.
    hdr -- the headers dictionary that was sent with the request.

    The log/ directory must already exist, otherwise open() fails.
    """
    # 'with' closes the file even when the write raises.
    with open('log/%04d.req' % idx, 'w') as of:
        of.write("%s\n" % pprint.pformat(hdr))
#------------------------------------------------------------------------------- | |||
def log_resp_header(idx, resp):
    """Dump a response's attributes, status line and headers to log/NNNN.hdr.

    idx  -- request sequence number, zero-padded to four digits in the name.
    resp -- response object exposing 'status', 'reason' and 'msg'.

    The log/ directory must already exist, otherwise open() fails.
    """
    # 'with' closes the file even when one of the writes raises.
    with open('log/%04d.hdr' % idx, 'w') as of:
        of.write("resp.__dict__ ->\n%s\n\n" % pprint.pformat(resp.__dict__))
        of.write("Status %s Reason [%s]\n" % (resp.status, resp.reason))
        of.write("Msg ->\n%s\n\n" % resp.msg)
        of.write("Msg.__dict__ ->\n%s\n\n" % pprint.pformat(resp.msg.__dict__))
#------------------------------------------------------------------------------- | |||
def log_resp_body(idx, resp_body):
    """Write the raw response body to log/NNNN.bdy.

    idx       -- request sequence number, zero-padded to four digits.
    resp_body -- the body exactly as read from the response.

    The log/ directory must already exist, otherwise open() fails.
    """
    # Binary mode: the requests advertise gzip/deflate, so the body may be
    # compressed data that newline translation would corrupt.
    with open('log/%04d.bdy' % idx, 'wb') as of:
        of.write(resp_body)
#------------------------------------------------------------------------------- | |||
def _extract_session_id(set_cookie):
    """Return the JSESSIONID value from a Set-Cookie header, or None."""
    if not set_cookie:
        return None
    # Stop at the first ';', ',' or blank so that cookie attributes
    # (Path=..., further cookies) are not swallowed into the id, and so that
    # a cookie without a trailing ';' still matches.
    m = re.search(r'JSESSIONID=([^;,\s]+)', set_cookie)
    if m:
        return m.group(1)
    return None


def _post_search(url, form_data, session_id):
    """POST url-encoded form_data to url with the session cookie attached."""
    params = urllib.urlencode(form_data)
    headers = copy(post_headers)
    headers['Content-Length'] = len(params)
    headers['Cookie'] = 'JSESSIONID=%s' % session_id
    return request('POST', url, params, headers)


def do():
    """Run one search session against SITE through PROXY.

    Steps: GET the home page, GET /wp/index.jsp to obtain the JSESSIONID
    cookie, POST a business search for 'Hard' in VIC, then POST again for
    page 2 of the results.  Every exchange is logged by request().

    If no JSESSIONID cookie is returned, the two POSTs are skipped.
    The connection is closed when the session ends, even on error.
    """
    global connection
    connection = httplib.HTTPConnection(PROXY)
    BASE_URL = 'http://%s' % SITE
    try:
        #--------------------------------------------------------------------
        request('GET', BASE_URL + '/', None, copy(get_headers))
        #--------------------------------------------------------------------
        resp = request('GET', BASE_URL + '/wp/index.jsp', None,
                       copy(get_headers))
        # getheader() returns None instead of raising when the server sent
        # no Set-Cookie header.
        JSESSIONID = _extract_session_id(resp.getheader('set-cookie'))
        if not JSESSIONID:
            return
        print(JSESSIONID)
        #--------------------------------------------------------------------
        # First results page: the session id is also carried in the URL.
        _post_search(
            BASE_URL + '/wp/busSearch.do;jsessionid=%s' % JSESSIONID,
            {
                'subscriberName': 'Hard',
                'state': 'VIC',
                'suburb': '',
                'street': '',
                'Search': 'Search',
            },
            JSESSIONID)
        #--------------------------------------------------------------------
        # Second results page: same search, selected with 'page'.
        _post_search(
            BASE_URL + '/wp/busSearch.do',
            {
                'subscriberName': 'Hard',
                'state': 'VIC',
                'page': '2',
            },
            JSESSIONID)
    finally:
        connection.close()
#------------------------------------------------------------------------------- | |||
def request(method, url, params, headers):
    """Send one request over the shared connection and log the exchange.

    method  -- HTTP method name, e.g. 'GET' or 'POST'.
    url     -- URL to request.
    params  -- request body, or None.
    headers -- dictionary of request headers.

    The request headers, response headers and response body are written to
    log files numbered with the global counter 'idx', which is then
    incremented.  The body is also echoed to stdout.  Returns the response
    object; its body has already been read.
    """
    global idx
    banner = '>>>> %s %s <<<<' % (method, url)
    print(banner)
    connection.request(method, url, params, headers)
    response = connection.getresponse()
    seq = idx
    log_req_header(seq, headers)
    # Logged before read() so the dump shows the response as first received.
    log_resp_header(seq, response)
    payload = response.read()
    log_resp_body(seq, payload)
    print(payload)
    idx = seq + 1
    return response
#------------------------------------------------------------------------------- | |||
# Run the search session only when executed as a script, not on import.
if __name__ == '__main__':
    do()
#------------------------------------------------------------------------------- | |||
Revision as of 12:06, 13 December 2007
Searching WhitePages
The following Python script uses the httplib module to run a search against whitepages.com.au:
#!/usr/bin/env python
#
# Fetch business search results from whitepages.com.au through a proxy and
# log every request/response exchange under ./log (which must already exist).
#
#-------------------------------------------------------------------------------

import re
import sys
import base64
import pprint
import urllib
import httplib

from copy import copy

#-------------------------------------------------------------------------------

# Address the HTTP connection is opened against, as 'host:port'.
PROXY = 'PROXY:8080'
# Site being searched; also sent as the Host header of the GET requests.
SITE = 'www.whitepages.com.au'

# Shared httplib connection; assigned by do() and used by request().
connection = None

#-------------------------------------------------------------------------------

# Browser-like headers sent with the GET requests.
# A narrower alternative for 'Accept' is 'text/plain, text/html'.
# NOTE(review): gzip/deflate is advertised here, but nothing in this script
# decompresses the response body -- confirm the logged bodies are usable.
get_headers = {
    'Accept-Encoding': 'gzip, deflate',
    'Accept': '*/*',
    'Accept-Language': 'en-au',
    'Host': SITE,
    'Connection': 'Keep-Alive',
    # The closing parenthesis was missing from the original string.
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; '
                  '.NET CLR 1.1.4322; .NET CLR 2.0.50727)',
}

# Headers sent with the url-encoded form POSTs.
post_headers = {
    'Content-type': 'application/x-www-form-urlencoded',
    'Accept': 'text/plain',
}

# Sequence number of the next request; used to name the log files.
idx = 0

#-------------------------------------------------------------------------------

def log_req_header(idx, hdr):
    """Write the pretty-printed request headers to log/NNNN.req."""
    # 'with' closes the file even when the write raises.
    with open('log/%04d.req' % idx, 'w') as of:
        of.write("%s\n" % pprint.pformat(hdr))

#-------------------------------------------------------------------------------

def log_resp_header(idx, resp):
    """Dump a response's attributes, status line and headers to log/NNNN.hdr."""
    with open('log/%04d.hdr' % idx, 'w') as of:
        of.write("resp.__dict__ ->\n%s\n\n" % pprint.pformat(resp.__dict__))
        of.write("Status %s Reason [%s]\n" % (resp.status, resp.reason))
        of.write("Msg ->\n%s\n\n" % resp.msg)
        of.write("Msg.__dict__ ->\n%s\n\n" % pprint.pformat(resp.msg.__dict__))

#-------------------------------------------------------------------------------

def log_resp_body(idx, resp_body):
    """Write the raw response body to log/NNNN.bdy."""
    # Binary mode: the body may be compressed data that newline translation
    # would corrupt.
    with open('log/%04d.bdy' % idx, 'wb') as of:
        of.write(resp_body)

#-------------------------------------------------------------------------------

def _extract_session_id(set_cookie):
    """Return the JSESSIONID value from a Set-Cookie header, or None."""
    if not set_cookie:
        return None
    # Stop at the first ';', ',' or blank so cookie attributes are not
    # swallowed into the id, and a cookie without a trailing ';' still matches.
    m = re.search(r'JSESSIONID=([^;,\s]+)', set_cookie)
    if m:
        return m.group(1)
    return None


def _post_search(url, form_data, session_id):
    """POST url-encoded form_data to url with the session cookie attached."""
    params = urllib.urlencode(form_data)
    headers = copy(post_headers)
    headers['Content-Length'] = len(params)
    headers['Cookie'] = 'JSESSIONID=%s' % session_id
    return request('POST', url, params, headers)


def do():
    """Run one search session against SITE through PROXY.

    Steps: GET the home page, GET /wp/index.jsp to obtain the JSESSIONID
    cookie, POST a business search for 'Hard' in VIC, then POST again for
    page 2 of the results.  If no JSESSIONID cookie is returned, the two
    POSTs are skipped.  The connection is closed when the session ends.
    """
    global connection
    connection = httplib.HTTPConnection(PROXY)
    BASE_URL = 'http://%s' % SITE
    try:
        #--------------------------------------------------------------------
        request('GET', BASE_URL + '/', None, copy(get_headers))
        #--------------------------------------------------------------------
        resp = request('GET', BASE_URL + '/wp/index.jsp', None,
                       copy(get_headers))
        # getheader() returns None instead of raising when the server sent
        # no Set-Cookie header.
        JSESSIONID = _extract_session_id(resp.getheader('set-cookie'))
        if not JSESSIONID:
            return
        print(JSESSIONID)
        #--------------------------------------------------------------------
        # First results page: the session id is also carried in the URL.
        _post_search(
            BASE_URL + '/wp/busSearch.do;jsessionid=%s' % JSESSIONID,
            {
                'subscriberName': 'Hard',
                'state': 'VIC',
                'suburb': '',
                'street': '',
                'Search': 'Search',
            },
            JSESSIONID)
        #--------------------------------------------------------------------
        # Second results page: same search, selected with 'page'.
        _post_search(
            BASE_URL + '/wp/busSearch.do',
            {
                'subscriberName': 'Hard',
                'state': 'VIC',
                'page': '2',
            },
            JSESSIONID)
    finally:
        connection.close()

#-------------------------------------------------------------------------------

def request(method, url, params, headers):
    """Send one request over the shared connection and log the exchange.

    The request headers, response headers and response body are written to
    log files numbered with the global counter 'idx', which is then
    incremented.  The body is also echoed to stdout.  Returns the response
    object; its body has already been read.
    """
    global idx
    print('>>>> %s %s <<<<' % (method, url))
    connection.request(method, url, params, headers)
    resp = connection.getresponse()
    log_req_header(idx, headers)
    # Logged before read() so the dump shows the response as first received.
    log_resp_header(idx, resp)
    resp_body = resp.read()
    log_resp_body(idx, resp_body)
    print(resp_body)
    idx += 1
    return resp

#-------------------------------------------------------------------------------

# Run the search session only when executed as a script, not on import.
if __name__ == '__main__':
    do()

#-------------------------------------------------------------------------------
Parsing the Search Results
The search results HTML looks as follows:
Hard On Tools
- 5 Scoresby Rd Bayswater 3153
- (03) 9720 5199
- <form class='mapForm' action='whereIs.do' method='post'><input type='hidden' name='streetNumber' value='5' /><input type='hidden' name='streetName' value='Scoresby' /><input type='hidden' name='streetType' value='Rd' /><input type='hidden' name='locality' value='Bayswater' /><input type='hidden' name='state' value='VIC' /><input type='hidden' name='placeName' value='Hard On Tools' /><input type='hidden' name='address' value='5 Scoresby Rd Bayswater 3153' /><input type='hidden' name='phoneNumber' value='(03) 9720 5199' /><input type='hidden' name='link' value='1197346075688' /><input type='hidden' name='hashCode' value='152335231501250' /><input type='hidden' name='brandId' value='5' /><input type='hidden' name='logData' value='' /><input type='hidden' name='subscriberName' value='Hard On Tools' /><input name='mapSubmit' src='images/result-map.gif' alt='Map' type='image' /></form>
OR...
- (03) 9738 2882
Fax
- (03) 9720 0966
Hard Parts Australia Pty Ltd
- 14 Yiannis Crt Springvale 3171
- 0418 756 340
- <form class='mapForm' action='whereIs.do' method='post'><input type='hidden' name='streetNumber' value='14' /><input type='hidden' name='streetName' value='Yiannis' /><input type='hidden' name='streetType' value='Crt' /><input type='hidden' name='locality' value='Springvale' /><input type='hidden' name='state' value='VIC' /><input type='hidden' name='placeName' value='Hard Parts Australia Pty Ltd' /><input type='hidden' name='address' value='14 Yiannis Crt Springvale 3171' /><input type='hidden' name='phoneNumber' value='0418 756 340' /><input type='hidden' name='link' value='1197346075688' /><input type='hidden' name='hashCode' value='152335231501250' /><input type='hidden' name='brandId' value='5' /><input type='hidden' name='logData' value='' /><input type='hidden' name='subscriberName' value='Hard Parts Australia Pty Ltd' /><input name='mapSubmit' src='images/result-map.gif' alt='Map' type='image' /></form>
Hard Parts Victoria
The following script uses the BeautifulSoup package (see https://www.crummy.com/software/BeautifulSoup/) to extract the entries from a saved search results page:
#!/usr/bin/env python
#
# Print the name, address and phone details of every search-result entry
# found in a saved whitepages.com.au results page.

import pprint

from BeautifulSoup import BeautifulSoup

# NOTE(review): the fetch script logs response bodies as log/NNNN.bdy; this
# path presumably refers to a renamed copy of log/0002.bdy -- confirm.
RESULTS_FILE = 'log/0002.html'

# Hidden <input> fields of each entry's map form, in the order they are printed.
FORM_FIELDS = (
    'placeName',
    'address',
    'locality',
    'streetNumber',
    'streetName',
    'streetType',
)

# Parse the whole page up front so the file can be closed straight away.
doc = open(RESULTS_FILE, 'r')
try:
    soup = BeautifulSoup(doc)
finally:
    doc.close()

# Each search result is wrapped in <div class="encap_result">.
for obj in soup.findAll('div', {'class': 'encap_result'}):
    # NOTE(review): the published listing lost its indentation; the separator
    # is assumed to be the only statement guarded by the text check -- confirm.
    if obj.find(text=True):
        print("===========================================")

    for span in obj.findAll('span', {'class': 'black'}):
        print('span="black" -> "%s"' % span.find(text=True))

    # One pass per field; 'placeName' is printed once (the original listing
    # had a duplicated loop that printed it twice).
    for field in FORM_FIELDS:
        for tag in obj.findAll('input', {'name': field}):
            print('%s -> "%s"' % (field, tag.attrMap['value']))

    # List items without a class attribute.
    for li in obj.findAll('li', {'class': None}):
        print('li -> "%s"' % li.find(text=True))

    for address in obj.findAll('li', {'class': 'entryData address'}):
        print('addr -> "%s"' % address.find(text=True))

    for phone in obj.findAll('li', {'class': 'entryData phone'}):
        print('phone -> "%s"' % phone.find(text=True))