= Searching WhitePages =
The following Python script uses the httplib module to search whitepages.com.au (via a proxy).
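With plain httplib, going through a proxy means opening the connection to the proxy itself and then requesting the absolute URL of the target site, which is the pattern used throughout the script. A minimal sketch of just that mechanism ('PROXY:8080' is a placeholder for a real host:port, as in the script):

<pre>
import httplib

# Connect to the proxy, then ask it for the absolute URL of the target.
conn = httplib.HTTPConnection('PROXY:8080')
conn.request('GET', 'http://www.whitepages.com.au/')
print conn.getresponse().status
</pre>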
<pre>
#!/usr/bin/env python
#
# Search whitepages.com.au (via a proxy) and log each request and
# response to files under log/ for later parsing.
#
#-------------------------------------------------------------------------------

import re
import sys
import base64
import pprint
import urllib
import httplib

from copy import copy

#-------------------------------------------------------------------------------

PROXY = 'PROXY:8080'               # host:port of the local HTTP proxy
SITE  = 'www.whitepages.com.au'

connection = None

#===== Headers =================================================================

#  'Accept'          : 'text/plain, text/html',

get_headers = {
    'Accept-Encoding' : 'gzip, deflate',
    'Accept'          : '*/*',
    'Accept-Language' : 'en-au',
    'Host'            : SITE,
    'Connection'      : 'Keep-Alive',
    'User-Agent'      : 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)'
}

post_headers = {
    'Content-type' : 'application/x-www-form-urlencoded',
    'Accept'       : 'text/plain'
}

idx = 0                            # sequence number used to name the log files

#===== Logging =================================================================

def log_req_header(idx, hdr):
    of = open('log/%04d.req' % idx, 'w')
    of.write("%s\n" % pprint.pformat(hdr))
    of.close()

#-------------------------------------------------------------------------------

def log_resp_header(idx, resp):
    of = open('log/%04d.hdr' % idx, 'w')
    of.write("resp.__dict__ ->\n%s\n\n" % pprint.pformat(resp.__dict__))
    of.write("Status %s Reason [%s]\n" % (resp.status, resp.reason))
    of.write("Msg ->\n%s\n\n" % resp.msg)
    of.write("Msg.__dict__ ->\n%s\n\n" % pprint.pformat(resp.msg.__dict__))
    of.close()

#-------------------------------------------------------------------------------

def log_resp_body(idx, resp_body):
    of = open('log/%04d.bdy' % idx, 'w')
    of.write(resp_body)
    of.close()

#===== Encapsulate the request code ============================================

def request(method, url, params, headers):
    global idx
    print '>>>> %s %s <<<<' % (method, url)
    connection.request(method, url, params, headers)
    resp = connection.getresponse()
    log_req_header(idx, headers)
    log_resp_header(idx, resp)
    resp_body = resp.read()
    log_resp_body(idx, resp_body)
    print resp_body
    idx += 1
    return resp

#===============================================================================

def do():
    global connection

    connection = httplib.HTTPConnection(PROXY)

    BASE_URL = 'http://%s' % SITE

    # Warm up: fetch the site root, then the search page (which sets the
    # JSESSIONID session cookie).
    #------------------------------------------------------------------------

    DO      = 'GET'
    URL     = BASE_URL + '/'
    headers = copy(get_headers)

    request(DO, URL, None, headers)

    #------------------------------------------------------------------------

    DO      = 'GET'
    URL     = BASE_URL + '/wp/index.jsp'
    headers = copy(get_headers)

    resp = request(DO, URL, None, headers)

    m = re.search('JSESSIONID=(.*);', resp.msg.__dict__['dict']['set-cookie'])

    if m:
        JSESSIONID = m.group(1)
        print JSESSIONID
    else:
        sys.exit('No JSESSIONID cookie in response - aborting')

    # POST the search form, passing the session id both in the URL and as
    # a cookie.
    #------------------------------------------------------------------------

    DO      = 'POST'
    URL     = BASE_URL + '/wp/busSearch.do;jsessionid=%s' % JSESSIONID
    headers = copy(post_headers)

    form_data = {
        'subscriberName' : 'Hard',
        'state'          : 'VIC',
        'suburb'         : '',
        'street'         : '',
        'Search'         : 'Search'
    }

    params = urllib.urlencode(form_data)

    headers['Content-Length'] = len(params)
    headers['Cookie']         = 'JSESSIONID=%s' % JSESSIONID

    request(DO, URL, params, headers)

    # Fetch page 2 of the search results.
    #------------------------------------------------------------------------

    URL = BASE_URL + '/wp/busSearch.do'

    form_data = {
        'subscriberName' : 'Hard',
        'state'          : 'VIC',
        'page'           : '2'
    }

    params = urllib.urlencode(form_data)

    headers['Content-Length'] = len(params)
    headers['Cookie']         = 'JSESSIONID=%s' % JSESSIONID

    request(DO, URL, params, headers)

#===============================================================================

do()

#-------------------------------------------------------------------------------
</pre>
This script writes the search results into files (page 1 => 'log/0002.bdy' and page 2 => 'log/0003.bdy'). Amend the above code to handle searches that produce more pages of results; one possible approach is sketched below.
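A minimal sketch of such an amendment, assuming the pager links on page 1 carry a <tt>;page=N</tt> parameter (the same assumption the more complex script below makes; check a logged .bdy file for the exact markup). It reuses the script's request() helper and the headers already set up in do():

<pre>
import re
import urllib

page_pat = re.compile(r';page=([0-9]+)">[0-9]+</a>')

def fetch_all_pages(first_body, base_url, headers, pattern='Hard', state='VIC'):
    # Collect the page numbers advertised in the pager links of page 1,
    # then POST once per page.
    pages = sorted(set(int(n) for n in page_pat.findall(first_body)))
    for page in pages:
        form_data = {
            'subscriberName' : pattern,
            'state'          : state,
            'page'           : str(page)
        }
        params = urllib.urlencode(form_data)
        headers['Content-Length'] = len(params)
        request('POST', base_url + '/wp/busSearch.do', params, headers)
</pre>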
= Parsing the Search Results =
The search results HTML looks as follows:
<pre>
<div class="encap_result" id="result-10"><ul><li id='res10-ln0'><h4><span class='blackboldcaps'>Hard ...</ul></li></ul><div class="clearMe"> </div></div><div class="encap_result" id="result-11"><ul><li>
</pre>
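Note that the markup is not well formed (the stray </ul></li></ul> run, for instance), which is why the parser below uses the forgiving BeautifulSoup parser rather than a strict XML parser. As a quick check, the result blocks in a logged page can be listed like this (assuming a page body was logged to log/0002.bdy by the script above):

<pre>
from BeautifulSoup import BeautifulSoup   # BeautifulSoup 3.x

soup = BeautifulSoup(open('log/0002.bdy').read())

# Each search result sits in a <div class="encap_result"> wrapper.
for div in soup.findAll('div', {'class': 'encap_result'}):
    print div['id']                       # result-10, result-11, ...
</pre>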
= Results of Search =

The results are written to a file as a block of XML.
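For illustration only, an entry in the output looks something like the following; the tag names are the ones emitted by wp_parser.py below, but the values here are invented:

<pre>
<entry>
   <tag>HARDYS HARDWARE</tag>
   <subscriberName>Hardys Hardware</subscriberName>
   <streetNumber>12</streetNumber>
   <streetName>Example</streetName>
   <streetType>St</streetType>
   <locality>Melbourne</locality>
   <phoneNumber>(03) 9123 4567</phoneNumber>
</entry>
</pre>

The entries are concatenated with no enclosing root element, so wrap the file contents (e.g. in <results>...</results>) before handing it to a strict XML parser.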
= A More Complex Script =
This script loops through all the available pages of search results and extracts the entries using the BeautifulSoup HTML parsing module. The number of pages is derived from the pager links on the first page, as illustrated below.
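A small illustration of the page-counting logic, run against an invented fragment of pager markup (the real link format should be confirmed from a logged .bdy file):

<pre>
import re

next_page_pat = re.compile(r';(page=[0-9]*">[0-9]*</a>)')

# Invented pager fragment - one link per page of results.
body = ('<a href="/wp/busSearch.do;page=1">1</a>'
        '<a href="/wp/busSearch.do;page=2">2</a>'
        '<a href="/wp/busSearch.do;page=3">3</a>')

m = next_page_pat.findall(body)
print len(m)        # 3 pager links seen...
print len(m) - 1    # ...so 2 further fetches after the first page
</pre>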
== wp_get.py ==
<pre>
#!/usr/bin/env python
#
# Run a series of searches against whitepages.com.au, walk every page of
# results and accumulate the parsed entries as XML in results.xml.
#
#-------------------------------------------------------------------------------

import re
import sys
import base64
import pprint
import urllib
import httplib

from copy import copy

#-------------------------------------------------------------------------------

from wp_parser import parse

#-------------------------------------------------------------------------------

PROXY = 'PROXY:8080'
SITE  = 'www.whitepages.com.au'

connection    = None
next_page_pat = None
results       = ''

#-------------------------------------------------------------------------------

#  'Accept'          : 'text/plain, text/html',

get_headers = {
    'Accept-Encoding' : 'gzip, deflate',
    'Accept'          : '*/*',
    'Accept-Language' : 'en-au',
    'Host'            : SITE,
    'Connection'      : 'Keep-Alive',
    'User-Agent'      : 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)'
}

post_headers = {
    'Content-type' : 'application/x-www-form-urlencoded',
    'Accept'       : 'text/plain'
}

idx = 0

#-------------------------------------------------------------------------------

def log_req_header(idx, hdr):
    of = open('log/%04d.req' % idx, 'w')
    of.write("%s\n" % pprint.pformat(hdr))
    of.close()

#-------------------------------------------------------------------------------

def log_resp_header(idx, resp):
    of = open('log/%04d.hdr' % idx, 'w')
    of.write("resp.__dict__ ->\n%s\n\n" % pprint.pformat(resp.__dict__))
    of.write("Status %s Reason [%s]\n" % (resp.status, resp.reason))
    of.write("Msg ->\n%s\n\n" % resp.msg)
    of.write("Msg.__dict__ ->\n%s\n\n" % pprint.pformat(resp.msg.__dict__))
    of.close()

#-------------------------------------------------------------------------------

def log_resp_body(idx, resp_body):
    of = open('log/%04d.bdy' % idx, 'w')
    of.write(resp_body)
    of.close()

#-------------------------------------------------------------------------------

def do(pattern, state):
    global connection
    global results

    results = ''            # start afresh; results.xml is opened in append mode

    print "Pattern ==> '%s'" % pattern

    connection = httplib.HTTPConnection(PROXY)

    BASE_URL = 'http://%s' % SITE

    #------------------------------------------------------------------------

    DO      = 'GET'
    URL     = BASE_URL + '/'
    headers = copy(get_headers)

    request(DO, URL, None, headers)

    #------------------------------------------------------------------------

    DO      = 'GET'
    URL     = BASE_URL + '/wp/index.jsp'
    headers = copy(get_headers)

    r = request(DO, URL, None, headers)

    m = re.search('JSESSIONID=(.*);', r['response_header'].msg.__dict__['dict']['set-cookie'])

    if m:
        JSESSIONID = m.group(1)
        print JSESSIONID
    else:
        sys.exit('No JSESSIONID cookie in response - aborting')

    #------------------------------------------------------------------------

    DO      = 'POST'
    URL     = BASE_URL + '/wp/busSearch.do;jsessionid=%s' % JSESSIONID
    headers = copy(post_headers)

    form_data = {
        'subscriberName' : pattern,
        'state'          : state,
        'suburb'         : '',
        'street'         : '',
        'Search'         : 'Search'
    }

    params = urllib.urlencode(form_data)

    headers['Content-Length'] = len(params)
    headers['Cookie']         = 'JSESSIONID=%s' % JSESSIONID

    r = request(DO, URL, params, headers)

    xml      = parse('log/%04d.bdy' % r['idx'])
    results += xml

    # Count the pager links on the first page of results to work out how
    # many more fetches are needed.

    body = r['response_body']

    m   = next_page_pat.findall(body)
    cnt = 0

    if m:
        no_pages = len(m)
        for i in range(no_pages):
            print m[i]
        cnt = no_pages - 1

    # Fetch and parse the remaining pages (page 1 has already been
    # fetched above; adjust the page indices if the pager on page 1
    # advertises a different set of links).
    #------------------------------------------------------------------------

    for i in range(cnt):
        URL = BASE_URL + '/wp/busSearch.do'

        form_data = {
            'subscriberName' : pattern,
            'state'          : state,
            'page'           : i + 1
        }

        params = urllib.urlencode(form_data)

        headers['Content-Length'] = len(params)
        headers['Cookie']         = 'JSESSIONID=%s' % JSESSIONID

        r = request(DO, URL, params, headers)

        xml      = parse('log/%04d.bdy' % r['idx'])
        results += xml

    ofh = open('results.xml', 'a+')
    ofh.write(results)
    ofh.close()

#-------------------------------------------------------------------------------

def request(method, url, params, headers):
    global idx
    print '>>>> %s %s <<<<' % (method, url)
    connection.request(method, url, params, headers)
    resp = connection.getresponse()
    log_req_header(idx, headers)
    log_resp_header(idx, resp)
    resp_body = resp.read()
    log_resp_body(idx, resp_body)
    r = {'idx'             : idx,
         'request_header'  : headers,
         'response_header' : resp,
         'response_body'   : resp_body}
    idx += 1
    return r

#-------------------------------------------------------------------------------

def process(search_patterns):
    global next_page_pat

    next_page_pat = re.compile(r';(page=[0-9]*">[0-9]*</a>)')

    for search_pattern in search_patterns:
        do(search_pattern, 'VIC')

#-------------------------------------------------------------------------------

def searches():
    # Generate two-letter patterns Wa, We, ... Zu.
    searches = []
    for i in range(4):
        ch = chr(ord('W') + i)
        for x in ('a', 'e', 'i', 'o', 'u'):
            searches.append(ch + x)
    return searches

#-------------------------------------------------------------------------------

def used():
    # Patterns already searched - kept for reference.
    searches = []
    searches.append('Zeus')

#-------------------------------------------------------------------------------

def main():
    searches = []
    searches.append('Zany')
    searches.append('Zan')
    searches.append('Zen')
    searches.append('Zend')
    process(searches)

#-------------------------------------------------------------------------------

main()

#-------------------------------------------------------------------------------
</pre>
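Digging the session cookie out of resp.msg.__dict__ works but is fragile. A sketch of an alternative using the public HTTPResponse.getheader() method, with the behaviour otherwise unchanged:

<pre>
import re

def session_id(resp):
    # resp is the httplib.HTTPResponse held in r['response_header'].
    cookie = resp.getheader('set-cookie') or ''
    m = re.search('JSESSIONID=([^;]*)', cookie)
    return m.group(1) if m else None
</pre>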
== wp_parser.py ==

Note that the file must be saved as wp_parser.py so that the ''from wp_parser import parse'' in wp_get.py resolves.
<pre>
#!/usr/bin/env python
#
# wp_parser.py - pull the search-result entries out of a logged results
# page and return them as a block of XML.

from BeautifulSoup import BeautifulSoup

#-------------------------------------------------------------------------------------

def parse(fname):
    doc  = open(fname, 'r')
    soup = BeautifulSoup(doc)
    doc.close()

    # Each search result is wrapped in <div class="encap_result">.
    objs = soup.findAll('div', {'class' : 'encap_result'})

    xml = ''

    for obj in objs:
        # Skip empty wrapper divs.
        if not obj.find(text=True):
            continue

        xml += '<entry>\n'

        for s in obj.findAll('span', {'class' : 'black'}):
            xml += '   <tag>%s</tag>\n' % s.find(text=True)

        # The interesting fields are carried as hidden <input> elements.
        for s in obj.findAll('input', {'name' : 'placeName'}):
            xml += '   <placeName>%s</placeName>\n' % s.attrMap['value']

        for s in obj.findAll('input', {'name' : 'subscriberName'}):
            xml += '   <subscriberName>%s</subscriberName>\n' % s.attrMap['value']

        for s in obj.findAll('input', {'name' : 'address'}):
            xml += '   <address>%s</address>\n' % s.attrMap['value']

        for s in obj.findAll('input', {'name' : 'streetNumber'}):
            xml += '   <streetNumber>%s</streetNumber>\n' % s.attrMap['value']

        for s in obj.findAll('input', {'name' : 'streetName'}):
            xml += '   <streetName>%s</streetName>\n' % s.attrMap['value']

        for s in obj.findAll('input', {'name' : 'streetType'}):
            xml += '   <streetType>%s</streetType>\n' % s.attrMap['value']

        for s in obj.findAll('input', {'name' : 'locality'}):
            xml += '   <locality>%s</locality>\n' % s.attrMap['value']

        for li in obj.findAll('li', {'class' : None}):
            xml += '   <li>%s</li>\n' % li.find(text=True)

        for address in obj.findAll('li', {'class' : 'entryData address'}):
            xml += '   <addr>%s</addr>\n' % address.find(text=True)

        for phone in obj.findAll('li', {'class' : 'entryData phoneNumber'}):
            xml += '   <phoneNumber>%s</phoneNumber>\n' % phone.find(text=True)

        xml += '</entry>\n\n'

    return xml

#-------------------------------------------------------------------------------------

def test():
    xml = parse('html/0002.html')
    print xml

#-------------------------------------------------------------------------------------
</pre>
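The parser can also be exercised on its own against any logged page body, assuming one exists at log/0002.bdy:

<pre>
from wp_parser import parse

xml = parse('log/0002.bdy')
print xml
</pre>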
= Results of Search =

The accumulated entries are appended to results.xml as XML.

[[Category:Python]]
[[Category:Python httplib]]
[[Category:Internet]]