Parsing WhitePages Search Results HTML

From PeformIQ Upgrade
Revision as of 11:27, 25 January 2008 by PeterHarding (talk | contribs)
Jump to navigation Jump to search

Searching WhitePages

The following Python script uses httplib to search whitepages.com.au (via a proxy).

 #!/usr/bin/env python
 #
 #
 #-------------------------------------------------------------------------------
 
 import re
 import sys
 import base64
 import pprint
 import urllib
 import httplib
 
 from copy import copy
 
 #-------------------------------------------------------------------------------
 
 # host:port of the HTTP proxy every request is sent through
 PROXY       = 'PROXY:8080'
 # Target site; also sent as the Host header
 SITE        = 'www.whitepages.com.au'
 
 # Shared httplib.HTTPConnection; assigned in do() and used by request()
 connection  = None
 
 #===== Headers =================================================================
 
 #    'Accept' : 'text/plain, text/html',
 
 # Headers sent with the GET requests.
 get_headers = {
    # Ask for an uncompressed body: the script logs, prints and later parses
    # the response as plain HTML and never decodes gzip/deflate content.
    'Accept-Encoding'    : 'identity',
    'Accept'             : '*/*',
    'Accept-Language'    : 'en-au',
    'Host'               : SITE,
    'Connection'         : 'Keep-Alive',
    # The closing parenthesis was missing from the original User-Agent string.
    'User-Agent'         : 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)'
 }
 
 # Base headers for the form POSTs; do() adds Content-Length and Cookie.
 post_headers = {
    'Content-type'       : 'application/x-www-form-urlencoded',
    'Accept'             : 'text/plain'
 }
 
 # Sequence number of the next request; used to name the log files.
 idx      = 0
 
 #===== Logging =================================================================
 
 def log_req_header(idx, hdr):
    """Write the request headers of request number idx to 'log/NNNN.req'.

    idx -- request sequence number, zero-padded to four digits in the file name
    hdr -- the headers object to record; it is pretty-printed with pprint

    The 'log' directory must already exist.
    """
    of = open('log/%04d.req' % idx, 'w')
    try:
       of.write("%s\n" % pprint.pformat(hdr))
    finally:
       # Release the file handle even if formatting or writing fails.
       of.close()
 
 #-------------------------------------------------------------------------------
 
 def log_resp_header(idx, resp):
    """Dump the response status and headers of request idx to 'log/NNNN.hdr'.

    idx  -- request sequence number, zero-padded to four digits in the file name
    resp -- response object; its __dict__, status, reason, msg and
            msg.__dict__ are written out

    The 'log' directory must already exist.
    """
    of = open('log/%04d.hdr' % idx, 'w')
    try:
       of.write("resp.__dict__ ->\n%s\n\n" % pprint.pformat(resp.__dict__))
       of.write("Status %s  Reason [%s]\n" % (resp.status, resp.reason))
       of.write("Msg ->\n%s\n\n" % resp.msg)
       of.write("Msg.__dict__ ->\n%s\n\n" % pprint.pformat(resp.msg.__dict__))
    finally:
       # Release the file handle even if one of the writes fails.
       of.close()
 
 #-------------------------------------------------------------------------------
 
 def log_resp_body(idx, resp_body):
    """Save the response body of request number idx to 'log/NNNN.bdy'.

    idx       -- request sequence number, zero-padded to four digits in the file name
    resp_body -- the body text to write, exactly as received

    The 'log' directory must already exist.
    """
    of = open('log/%04d.bdy' % idx, 'w')
    try:
       of.write(resp_body)
    finally:
       # Release the file handle even if the write fails.
       of.close()
 
 #===== Encapsulate the request code ============================================
 
 def request(method, url, params, headers):
    global idx
 
    print '>>>> %s %s <<<<' % (method, url)
 
    connection.request(method, url, params, headers)
 
    resp = connection.getresponse()
 
    log_req_header(idx, headers)
    log_resp_header(idx, resp)
 
    resp_body = resp.read()
 
    log_resp_body(idx, resp_body)
 
    print resp_body
 
    idx += 1
 
    return resp
 
 #===============================================================================
 
 def do():
    global connection
    connection  = httplib.HTTPConnection(PROXY)
 
    BASE_URL    = 'http://%s' % SITE
 
    #------------------------------------------------------------------------
 
    DO       = 'GET'
    URL      = BASE_URL + '/'
 
    headers = copy(get_headers)
 
    request(DO, URL, None, headers)
 
 
    #------------------------------------------------------------------------
 
    DO  = 'GET'
    URL = BASE_URL + '/wp/index.jsp'
 
    headers = copy(get_headers)
 
    resp = request(DO, URL, None, headers)
 
    m = re.search('JSESSIONID=(.*);', resp.msg.__dict__['dict']['set-cookie'])
 
    if m:
       print m.group(1)
       JSESSIONID = m.group(1)
 
    print JSESSIONID
 
    #---------------------------------------------------------------------
 
    DO  = 'POST'
    URL = BASE_URL + '/wp/busSearch.do;jsessionid=%s' % JSESSIONID
 
    headers = copy(post_headers)
 
    form_data = {
       'subscriberName' : 'Hard',
       'state'          : 'VIC',
       'suburb'         : '',
       'street'         : '',
       'Search'         : 'Search'
    }
 
    params = urllib.urlencode(form_data)
 
    headers['Content-Length'] = len(params)
    headers['Cookie']         = 'JSESSIONID=%s' % JSESSIONID
 
    request(DO, URL, params, headers)
 
    #---------------------------------------------------------------------
 
    URL = BASE_URL + '/wp/busSearch.do'
 
    form_data = {
       'subscriberName' : 'Hard',
       'state'          : 'VIC',
       'page'           : '2'
    }
 
    params = urllib.urlencode(form_data)
 
    headers['Content-Length'] = len(params)
    headers['Cookie']         = 'JSESSIONID=%s' % JSESSIONID
 
    request(DO, URL, params, headers)
  
 #===============================================================================
 
 # Script entry point: run the scripted search session.
 do()
 
 #-------------------------------------------------------------------------------

This script writes the search results into files (page 1 => 'log/0002.bdy' and page 2 => 'log/0003.bdy'). Amend the above code to handle more pages of search results.

Parsing the Search Results

The search results HTML looks as follows:

 <div class="encap_result" id="result-10"><ul><li id='res10-ln0'><h4><span class='blackboldcaps'>Hard On Tools</span></h4></li><li class='subMultiContainer' id='res10-ln1'><ul><li class='entryData address'>5 Scoresby Rd Bayswater 3153</li><li class='entryData phoneNumber'><span class='blackboldcaps'>(03) 9720 5199</span></li><li class='entryData whereIsMap'><form class='mapForm' action='whereIs.do' method='post'><input type='hidden' name='streetNumber' value='5' /><input type='hidden' name='streetName' value='Scoresby' /><input type='hidden' name='streetType' value='Rd' /><input type='hidden' name='locality' value='Bayswater' /><input type='hidden' name='state' value='VIC' /><input type='hidden' name='placeName' value='Hard On Tools' /><input type='hidden' name='address' value='5 Scoresby Rd Bayswater 3153' /><input type='hidden' name='phoneNumber' value='(03) 9720 5199' /><input type='hidden' name='link' value='1197346075688' /><input type='hidden' name='hashCode' value='152335231501250' /><input type='hidden' name='brandId' value='5' /><input type='hidden' name='logData' value='' /><input type='hidden' name='subscriberName' value='Hard On Tools' /><input name='mapSubmit' src='images/result-map.gif' alt='Map' type='image'  /></form></li></ul></li><li class='subMultiContainer' id='res10-ln2'><ul><li class='entryData address'><h5><span class='black'>OR...</span></h5></li><li class='entryData phoneNumber'>(03) 9738 2882</li></ul></li><li class='subMultiContainer' id='res10-ln3'><ul><li class='entryData address indent2'><h5><span class='black'>Fax</span></h5></li><li class='entryData phoneNumber'>(03) 9720 0966</li></ul></li></ul><div class="clearMe"> </div></div><div class="encap_result" id="result-11"><ul><li><h4><span class='black'>Hard Parts Australia Pty Ltd</span></h4></li>
<li class='entryData address'>14 Yiannis Crt Springvale 3171</li><li class='entryData phoneNumber'>0418 756 340</li><li class='entryData whereIsMap'><form class='mapForm' action='whereIs.do' method='post'><input type='hidden' name='streetNumber' value='14' /><input type='hidden' name='streetName' value='Yiannis' /><input type='hidden' name='streetType' value='Crt' /><input type='hidden' name='locality' value='Springvale' /><input type='hidden' name='state' value='VIC' /><input type='hidden' name='placeName' value='Hard Parts Australia Pty Ltd' /><input type='hidden' name='address' value='14 Yiannis Crt Springvale 3171' /><input type='hidden' name='phoneNumber' value='0418 756 340' /><input type='hidden' name='link' value='1197346075688' /><input type='hidden' name='hashCode' value='152335231501250' /><input type='hidden' name='brandId' value='5' /><input type='hidden' name='logData' value='' /><input type='hidden' name='subscriberName' value='Hard Parts Australia Pty Ltd' /><input name='mapSubmit' src='images/result-map.gif' alt='Map' type='image'  /></form></li></ul><div class="clearMe"> </div></div><div class="encap_result" id="result-12"><ul><li><h4><span class='black'>Hard Parts Victoria</span></h4></li>


The following script, which makes use of the BeautifulSoup package, is a first cut at parsing the useful data out of the WhitePages results HTML (see here for the BeautifulSoup home page):

 #!/usr/bin/env python
 
 import pprint
 
 from BeautifulSoup import BeautifulSoup
 
 doc = open('log/0002.html', 'r')
 
 soup = BeautifulSoup(doc)
 
 # print len(soup('table', { "class" : "table_style"}))
 
 # tables = soup.findAll('table', { "class" : "table_style"})
 objs = soup.findAll('div', { "class" : "encap_result"})
 
 pp = pprint.PrettyPrinter(3)
 
 for obj in objs:
    t = obj.find(text=True)
 
    if t:
       print "==========================================="
       # print t
 
    #print '[[%s]]\n\n' % obj.__dict__
    # print '[[%s]]\n\n' % obj
 
       f    = obj.findAll('span',  { 'class' : 'black'})
 
       for s in f:
          print 'span="black" -> "%s"' % s.find(text=True)
 
       f    = obj.findAll('input',  { "name" : 'placeName'})
 
       for s in f:
          # pp.pprint(s.__dict__)
          # print 'attrMap -> "%s"' % s.attrMap
          print 'placeName -> "%s"' % s.attrMap['value']
 
 
       for s in obj.findAll('input',  { "name" : 'placeName'}):
          print 'placeName -> "%s"' % s.attrMap['value']
 
       for s in obj.findAll('input',  { "name" : 'address'}):
          print 'address -> "%s"' % s.attrMap['value']
 
       for s in obj.findAll('input',  { "name" : 'locality'}):
          print 'locality -> "%s"' % s.attrMap['value']
 
       for s in obj.findAll('input',  { "name" : 'streetNumber'}):
          print 'streetNumber -> "%s"' % s.attrMap['value']
 
       for s in obj.findAll('input',  { "name" : 'streetName'}):
          print 'streetName -> "%s"' % s.attrMap['value']
 
       for s in obj.findAll('input',  { "name" : 'streetType'}):
          print 'streetType -> "%s"' % s.attrMap['value']
 
 
       lis       = obj.findAll('li',  { "class" : None})
 
       for li in lis:
          print 'li -> "%s"' % li.find(text=True)
 
       addresses      = obj.findAll('li',  { "class" : "entryData address"})
 
       for address in addresses:
          print 'addr -> "%s"' % address.find(text=True)
 
       phone_numbers  = obj.findAll('li',  { "class" : "entryData phone"})
 
       for phone in phone_numbers:
          print 'phone -> "%s"' % phone.find(text=True)