Parsing WhitePages Search Results HTML

From PeformIQ Upgrade
Revision as of 11:14, 13 December 2007 by PeterHarding (talk | contribs)
Jump to navigation Jump to search

Searching WhitePages

The following Python script uses the httplib module to run a search against www.whitepages.com.au (via a proxy).

#!/usr/bin/env python
#
#
#-------------------------------------------------------------------------------

import re
import sys
import base64
import pprint
import urllib
import httplib

from copy import copy

#-------------------------------------------------------------------------------

# 'host:port' of the HTTP proxy; do() opens the httplib connection to this
# address rather than to SITE itself.
PROXY       = 'PROXY:8080'
# Host name of the site being searched; also sent as the Host header on GETs.
SITE        = 'www.whitepages.com.au'

# Shared httplib connection.  do() assigns it and request() sends through it.
connection  = None

#===== Headers =================================================================
#    'Accept' : 'text/plain, text/html',

# Headers sent with the GET requests.
get_headers = {
   'Accept-Encoding'    : 'gzip, deflate',
   'Accept'             : '*/*',
   'Accept-Language'    : 'en-au',
   'Host'               : SITE,
   'Connection'         : 'Keep-Alive',
   # The closing ')' was missing, which left the product comment unterminated.
   'User-Agent'         : 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)'
}

# Base headers for the form POSTs; do() adds Content-Length and Cookie.
post_headers = {
   'Content-type'     : 'application/x-www-form-urlencoded',
   'Accept'           : 'text/plain'
}

# Sequence number of the next request; request() uses it to name the log
# files and increments it after every exchange.
idx      = 0

#===== Logging =================================================================

def log_req_header(idx, hdr):
   """Write the pretty-printed request headers to 'log/NNNN.req'.

   idx -- request sequence number; zero-padded to four digits in the file name
   hdr -- the request headers (any object pprint.pformat can render)

   The 'log' directory is not created here; open() fails if it is missing.
   """
   # 'with' closes the file even when formatting or writing raises.
   with open('log/%04d.req' % idx, 'w') as of:
      of.write("%s\n" % pprint.pformat(hdr))

#-------------------------------------------------------------------------------

def log_resp_header(idx, resp):
   """Dump a response's status line and header details to 'log/NNNN.hdr'.

   idx  -- request sequence number; zero-padded to four digits in the file name
   resp -- response object; its status, reason, msg and __dict__ are written

   The 'log' directory is not created here; open() fails if it is missing.
   """
   # 'with' closes the file even when one of the writes raises part-way.
   with open('log/%04d.hdr' % idx, 'w') as of:
      of.write("resp.__dict__ ->\n%s\n\n" % pprint.pformat(resp.__dict__))
      of.write("Status %s  Reason [%s]\n" % (resp.status, resp.reason))
      of.write("Msg ->\n%s\n\n" % resp.msg)
      of.write("Msg.__dict__ ->\n%s\n\n" % pprint.pformat(resp.msg.__dict__))

#-------------------------------------------------------------------------------

def log_resp_body(idx, resp_body):
   """Write a response body, unmodified, to 'log/NNNN.bdy'.

   idx       -- request sequence number; zero-padded to four digits in the file name
   resp_body -- the body text exactly as read from the response

   The 'log' directory is not created here; open() fails if it is missing.
   """
   # 'with' closes the file even when the write raises.
   with open('log/%04d.bdy' % idx, 'w') as of:
      of.write(resp_body)

#===== Encapsulate the request code ============================================

def request(method, url, params, headers):
   """Send one request over the shared connection and log the whole exchange.

   method  -- HTTP method name, e.g. 'GET' or 'POST'
   url     -- request URL handed to connection.request()
   params  -- request body, or None when there is none
   headers -- dictionary of request headers

   The request headers, response headers and response body are written to
   the log files numbered with the current idx, the body is also echoed to
   stdout, and idx is then advanced by one.

   Returns the response object; its body has already been read.
   """
   global idx

   print('>>>> %s %s <<<<' % (method, url))

   connection.request(method, url, params, headers)
   response = connection.getresponse()

   # The header logs are written before the body is read, so the dump of
   # the response object shows its state prior to read().
   log_req_header(idx, headers)
   log_resp_header(idx, response)

   body = response.read()
   log_resp_body(idx, body)
   print(body)

   idx = idx + 1

   return response
#===============================================================================

def _session_id_from_cookie(cookie):
   """Return the JSESSIONID value from a Set-Cookie header value, or None."""
   if not cookie:
      return None

   # Stop at the first ';' (or ',' / whitespace).  A greedy '(.*);' would run
   # on to the last ';' and swallow attributes such as 'Path=/' as well.
   m = re.search(r'JSESSIONID=([^;,\s]+)', cookie)

   if m:
      return m.group(1)

   return None

#-------------------------------------------------------------------------------

def _post_form(url, form_data, session_id):
   """POST the url-encoded form_data to url with the session cookie attached."""
   params  = urllib.urlencode(form_data)

   headers = copy(post_headers)
   headers['Content-Length'] = len(params)
   headers['Cookie']         = 'JSESSIONID=%s' % session_id

   return request('POST', url, params, headers)

#-------------------------------------------------------------------------------

def do():
   """Run the scripted search session through the proxy.

   Four requests are issued in order: the site's home page, the search form
   (whose Set-Cookie header supplies the session id), the search itself, and
   page 2 of the results.  request() logs every exchange, so with idx
   starting at 0 the two result pages end up in log/0002.* and log/0003.*.

   Raises RuntimeError when the search form response carries no JSESSIONID
   cookie, since the remaining requests cannot be built without it.
   """
   global connection
   connection  = httplib.HTTPConnection(PROXY)

   BASE_URL    = 'http://%s' % SITE

   #------------------------------------------------------------------------
   # Home page.

   request('GET', BASE_URL + '/', None, copy(get_headers))

   #------------------------------------------------------------------------
   # Search form; the response sets the session cookie.

   resp = request('GET', BASE_URL + '/wp/index.jsp', None, copy(get_headers))

   # getheader() is the public accessor and returns None when the header is
   # absent, instead of raising KeyError from the message internals.
   JSESSIONID = _session_id_from_cookie(resp.getheader('set-cookie'))

   if JSESSIONID is None:
      raise RuntimeError('no JSESSIONID cookie in the response to /wp/index.jsp')

   print(JSESSIONID)

   #---------------------------------------------------------------------
   # Page 1 of the results; the session id goes in the URL as well as in
   # the Cookie header.

   form_data = {
      'subscriberName' : 'Hard',
      'state'          : 'VIC',
      'suburb'         : '',
      'street'         : '',
      'Search'         : 'Search'
   }

   _post_form(BASE_URL + '/wp/busSearch.do;jsessionid=%s' % JSESSIONID, form_data, JSESSIONID)

   #---------------------------------------------------------------------
   # Page 2 of the results.

   form_data = {
      'subscriberName' : 'Hard',
      'state'          : 'VIC',
      'page'           : '2'
   }

   _post_form(BASE_URL + '/wp/busSearch.do', form_data, JSESSIONID)
 
#===============================================================================

# Run the scripted session as soon as this file is executed (or imported).
do()

#-------------------------------------------------------------------------------

This script writes the search results into files (page 1 => 'log/0002.bdy' and page 2 => 'log/0003.bdy'). Amend the above code to handle more pages of search results.

Parsing the Search Results

The search results HTML looks as follows:

  • Hard On Tools

    • 5 Scoresby Rd Bayswater 3153
    • (03) 9720 5199
    • <form class='mapForm' action='whereIs.do' method='post'><input type='hidden' name='streetNumber' value='5' /><input type='hidden' name='streetName' value='Scoresby' /><input type='hidden' name='streetType' value='Rd' /><input type='hidden' name='locality' value='Bayswater' /><input type='hidden' name='state' value='VIC' /><input type='hidden' name='placeName' value='Hard On Tools' /><input type='hidden' name='address' value='5 Scoresby Rd Bayswater 3153' /><input type='hidden' name='phoneNumber' value='(03) 9720 5199' /><input type='hidden' name='link' value='1197346075688' /><input type='hidden' name='hashCode' value='152335231501250' /><input type='hidden' name='brandId' value='5' /><input type='hidden' name='logData' value='' /><input type='hidden' name='subscriberName' value='Hard On Tools' /><input name='mapSubmit' src='images/result-map.gif' alt='Map' type='image' /></form>
    • OR...
    • (03) 9738 2882
    • Fax
    • (03) 9720 0966
 
  • Hard Parts Australia Pty Ltd

  • 14 Yiannis Crt Springvale 3171
  • 0418 756 340
  • <form class='mapForm' action='whereIs.do' method='post'><input type='hidden' name='streetNumber' value='14' /><input type='hidden' name='streetName' value='Yiannis' /><input type='hidden' name='streetType' value='Crt' /><input type='hidden' name='locality' value='Springvale' /><input type='hidden' name='state' value='VIC' /><input type='hidden' name='placeName' value='Hard Parts Australia Pty Ltd' /><input type='hidden' name='address' value='14 Yiannis Crt Springvale 3171' /><input type='hidden' name='phoneNumber' value='0418 756 340' /><input type='hidden' name='link' value='1197346075688' /><input type='hidden' name='hashCode' value='152335231501250' /><input type='hidden' name='brandId' value='5' /><input type='hidden' name='logData' value='' /><input type='hidden' name='subscriberName' value='Hard Parts Australia Pty Ltd' /><input name='mapSubmit' src='images/result-map.gif' alt='Map' type='image' /></form>
 
  • Hard Parts Victoria



  • The following script uses the BeautifulSoup package (see http://www.crummy.com/software/BeautifulSoup/) to parse a saved page of search results:

    #!/usr/bin/env python
    #
    # Parse one saved page of WhitePages search results and print, for every
    # result entry, its highlighted text, the hidden map-form fields, the plain
    # list items, the addresses and the phone numbers.
    
    import pprint
    
    from BeautifulSoup import BeautifulSoup
    
    # Page 1 of the results.  The search script saves response bodies as
    # log/NNNN.bdy, so that name is read here (this previously read
    # log/0002.html, which the search script never writes).
    RESULTS_FILE = 'log/0002.bdy'
    
    # Hidden <input> fields of an entry's map form, in the order they are printed.
    FORM_FIELDS  = ('placeName', 'address', 'locality',
                    'streetNumber', 'streetName', 'streetType')
    
    # Parse inside 'with' so the file is closed once the soup has been built.
    with open(RESULTS_FILE, 'r') as doc:
       soup = BeautifulSoup(doc)
    
    for obj in soup.findAll('div', { "class" : "encap_result"}):
       # Skip result containers that hold no text at all.
       if not obj.find(text=True):
          continue
    
       print("===========================================")
    
       for s in obj.findAll('span',  { 'class' : 'black'}):
          print('span="black" -> "%s"' % s.find(text=True))
    
       # One pass per field; each field, placeName included, is printed once.
       for field in FORM_FIELDS:
          for s in obj.findAll('input',  { "name" : field}):
             # tag['value'] is the public way to read an attribute.
             print('%s -> "%s"' % (field, s['value']))
    
       # NOTE(review): None presumably selects <li> tags without a class
       # attribute -- confirm against the BeautifulSoup documentation.
       for li in obj.findAll('li',  { "class" : None}):
          print('li -> "%s"' % li.find(text=True))
    
       for address in obj.findAll('li',  { "class" : "entryData address"}):
          print('addr -> "%s"' % address.find(text=True))
    
       for phone in obj.findAll('li',  { "class" : "entryData phone"}):
          print('phone -> "%s"' % phone.find(text=True))