Python - httplib
Examples
Whitepages
See the script that page-scrapes search results off the Whitepages site:
Parsing WhitePages Search Results HTML
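All of the examples on this page follow the same basic pattern: open an httplib.HTTPConnection, send a request with hand-built headers, then run regexes (or BeautifulSoup) over the response body. A minimal sketch of that pattern, assuming a hypothetical host, path and regex (not the actual Whitepages script):

#!/usr/bin/env python
# Minimal httplib GET + regex scrape.
# The host, path and pattern below are placeholders, not the real Whitepages ones.
import re
import httplib

conn = httplib.HTTPConnection('www.example.com')            # hypothetical host
conn.request('GET', '/search?name=smith', None,             # hypothetical path
             {'User-Agent' : 'Mozilla/4.0', 'Accept' : 'text/html'})
resp = conn.getresponse()
print resp.status, resp.reason

body = resp.read()
for m in re.finditer(r'<td class="name">(.*?)</td>', body): # placeholder pattern
    print m.group(1)

conn.close()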
File Download Example
#!/usr/bin/env python
#
#
#-------------------------------------------------------------------------------
import re
import sys
import urllib
import httplib
import binascii
#-------------------------------------------------------------------------------
SITE = 'host'
URL  = '/url/login'

params = urllib.urlencode({'aaa' : 1})

get_headers = {
    'Accept-Language' : 'en-au',
    'Accept'          : 'text/plain',
    'Content-Type'    : 'text/html; charset=utf-8',
    'Connection'      : 'Keep-Alive',
    'Host'            : SITE,
    'User-Agent'      : 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)'
}

post_headers = {
    'Accept-Language' : 'en-au',
    'Accept-Encoding' : 'gzip, deflate',
    'Content-Type'    : 'application/x-www-form-urlencoded',
    'Host'            : SITE,
    'Connection'      : 'Keep-Alive',
    'Cache-Control'   : 'no-cache',
    'User-Agent'      : 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)'
}
# 'Content-Length' : len(request),
# 'Cookie'         : 'JSESSIONID=%s' % JSESSIONID
#-------------------------------------------------------------------------------
def log_header(idx, resp):
    of = open('%04d.hdr' % idx, 'w')
    of.write("resp.__dict__ -> '%s'\n" % resp.__dict__)
    of.write("Status %s Reason [%s]\n" % (resp.status, resp.reason))
    of.write("Msg -> '%s'\n" % resp.msg)
    of.write("Msg.__dict__ -> '%s'\n" % resp.msg.__dict__)
    #xxx = "Msg.__dict__ -> '%s'" % resp.msg.__dict__['dict']['set-cookie']
    #print xxx
    of.close()
#-------------------------------------------------------------------------------
def log_body(idx, resp_body):
    of = open('%04d.bdy' % idx, 'w')
    of.write(resp_body)
    of.close()
#-------------------------------------------------------------------------------
def do():
    global URL    # URL is reassigned below once the session id is known

    conn = httplib.HTTPConnection(SITE)
    #---------------------------------------------------------------------
    idx = 1
    print ">>>>> GET %s <<<<<" % URL

    conn.request("GET", '/%s/login.do' % URL, None, get_headers)
    resp = conn.getresponse()
    log_header(idx, resp)

    # Pull the session id out of the Set-Cookie header.
    m = re.search('JSESSIONID=(.*);', resp.msg.__dict__['dict']['set-cookie'])
    if m:
        print m.group(1)
        JSESSIONID = m.group(1)

    resp_body = resp.read()
    log_body(idx, resp_body)
    print resp_body
    # <form action="http://xxxx:80/CustomerPortalWeb/login/login.do;jsessionid=vgp9GDVS6JyTly0v6NfsHG0rt1pLyvpMLxYnJf9MXsk3Yn0T2SZ3!1111094026" method="post">
    #---------------------------------------------------------------------
    idx = 2
    print ">>>>> POST /%s/login.do <<<<<" % URL
    print ">>>>> JSESSIONID = %s " % JSESSIONID

    URL = "/CustomerPortalWeb/login/login.do;jsessionid=%s" % JSESSIONID

    # form_data = {
    #     '{actionForm.username}' : 'svtest035@svt',
    #     '{actionForm.password}' : 'xxxx'
    # }
    form_data = {
        '{actionForm.username}' : 'admin',
        '{actionForm.password}' : 'xxxx'
    }
    params = urllib.urlencode(form_data)

    post_headers['Content-Length'] = len(params)
    post_headers['Cookie']         = 'JSESSIONID=%s' % JSESSIONID

    conn.request("POST", URL, params, post_headers)
    resp = conn.getresponse()
    log_header(idx, resp)
    resp_body = resp.read()
    log_body(idx, resp_body)
    #---------------------------------------------------------------------
    idx = 3
    print ">>>>> GET /%s/test.do <<<<<" % URL

    get_headers['Cookie'] = 'JSESSIONID=%s' % JSESSIONID
    conn.request("GET", '/%s/test.do' % URL, None, get_headers)
    resp = conn.getresponse()
    log_header(idx, resp)
    resp_body = resp.read()
    log_body(idx, resp_body)
    #---------------------------------------------------------------------
    idx = 4
    args = {
        '_nfpb'      : 'true',
        '_pageLabel' : 'ImportUserPage'
    }
    ue_args = urllib.urlencode(args)

    print ">>>>> GET /%s/test.do <<<<<" % URL

    get_headers['Cookie'] = 'JSESSIONID=%s' % JSESSIONID
    conn.request("GET", '/%s/test.do?%s' % (URL, ue_args), None, get_headers)
    resp = conn.getresponse()
    log_header(idx, resp)
    resp_body = resp.read()
    log_body(idx, resp_body)
    #---------------------------------------------------------------------
    conn.close()
#-------------------------------------------------------------------------------
def main(args):
    do()
#-------------------------------------------------------------------------------
if __name__ == "__main__":
    main(sys.argv[1:])
#-------------------------------------------------------------------------------
"""
Regex Stuff:

    regex = re.compile("\\n *")
    (name, cnt) = re.subn('esb:', '', node_name)
    value = re.sub(r'\n *', 'N/A', value)
"""

"""
FILE DOWNLOAD:

    h.putrequest('POST', '/scripts/cgi.exe?')
    h.putheader('Content-length', '%d' % len(params))
    h.putheader('Accept', 'text/plain')
    h.putheader('Host', 'test.site.com')
    h.endheaders()
    h.send(params)
    reply, msg, hdrs = h.getreply()
    data = h.getfile().read()
    file('test.file', 'w').write(data)
    h.close()
"""

"""
Accept-Language: en-au
Content-Type: application/x-www-form-urlencoded
Accept-Encoding: gzip, deflate
User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)
Host: xxx:8080
Content-Length: 54
Connection: Keep-Alive
Cache-Control: no-cache
Cookie: JSESSIONID=xxx
"""
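The FILE DOWNLOAD fragment in the trailing notes above uses the old httplib.HTTP interface (putrequest/getreply/getfile). A rough equivalent using the same HTTPConnection API as the rest of the script might look like the sketch below; the host, path and form fields are placeholders:

# Rough HTTPConnection equivalent of the legacy putrequest()-style download
# shown in the notes above; host, path and form fields are placeholders.
import urllib
import httplib

params  = urllib.urlencode({'report' : 'monthly'})   # hypothetical form data
headers = {
    'Content-Type'   : 'application/x-www-form-urlencoded',
    'Content-Length' : len(params),
    'Accept'         : 'text/plain',
}

conn = httplib.HTTPConnection('test.site.com')
conn.request('POST', '/scripts/cgi.exe?', params, headers)
resp = conn.getresponse()

data = resp.read()
open('test.file', 'wb').write(data)                  # save the response body to disk
conn.close()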
Modular
See here for a zipped-up copy of the files.
action
#!/usr/bin/env python
#
#
#-------------------------------------------------------------------------------
import re
import sys
#-------------------------------------------------------------------------------
import http
#-------------------------------------------------------------------------------
SITE = 'host'

site         = None
ASPSESSIONID = None

postcodes = [2000, 2001, 2007, 2010, 3000, 3001, 3162, 3124, 3005, 6009, 7001]
#-------------------------------------------------------------------------------
def login():
    global ASPSESSIONID
    #------------------------------------------------------------------------
    DO  = 'GET'
    URL = '/'

    headers = http.GET_Headers(SITE)

    r = http.Request(DO, URL, headers)
    site.request(r)
    #------------------------------------------------------------------------
    DO  = 'GET'
    URL = '/login.asp'

    headers = http.GET_Headers(SITE)
    headers['Referer'] = 'http://%s/' % SITE

    r = http.Request(DO, URL, headers)
    site.request(r)

    # Pull the session id out of the Set-Cookie header.
    m = re.search('ASPSESSIONIDSCBCSRCC=(.*);',
                  r.Response_headers.msg.__dict__['dict']['set-cookie'])
    if m:
        print "ASPSESSIONID --> %s" % m.group(1)
        ASPSESSIONID = m.group(1)
    #---------------------------------------------------------------------
    DO  = 'POST'
    URL = '/Login.asp'

    headers = http.POST_Headers(SITE)
    headers['Referer'] = 'http://%s/login.asp' % SITE
    headers['Cookie']  = 'ASPSESSIONIDSCBCSRCC=%s' % ASPSESSIONID

    post_data = {
        'txtUserName' : 'xxx',
        'txtPassword' : 'xxx',
        'txtUserID'   : '0',
        'imgOK.x'     : '32',
        'imgOK.y'     : '11'
    }

    r = http.Request(DO, URL, headers, post_data)
    site.request(r)
    #---------------------------------------------------------------------
    DO  = 'POST'
    URL = '/Login.asp'

    headers = http.POST_Headers(SITE)
    headers['Referer'] = 'http://%s/login.asp' % SITE
    headers['Cookie']  = 'ASPSESSIONIDSCBCSRCC=%s' % ASPSESSIONID

    post_data = {
        'txtUserID' : '1142',
        'imgOK.x'   : '31',
        'imgOK.y'   : '12'
    }

    r = http.Request(DO, URL, headers, post_data)
    site.request(r)
    #---------------------------------------------------------------------
    DO  = 'POST'
    URL = '/Login.asp'

    headers = http.POST_Headers(SITE)
    headers['Referer'] = 'http://%s/Login.asp' % SITE
    headers['Cookie']  = 'ASPSESSIONIDSCBCSRCC=%s' % ASPSESSIONID

    post_data = {
        'cmbDC'     : '311001%2CMETRO MAILS BUS UNIT CAP INV ',
        'txtUserID' : '1142',
        'imgOK2.x'  : '20',
        'imgOK2.y'  : '5'
    }

    r = http.Request(DO, URL, headers, post_data)
    site.request(r)
    #---------------------------------------------------------------------
    DO  = 'GET'
    URL = '/menuindex.asp?UserID=xxx,xxx'

    headers = http.GET_Headers(SITE)
    headers['Cookie'] = 'ASPSESSIONIDSCBCSRCC=%s' % ASPSESSIONID

    r = http.Request(DO, URL, headers)
    site.request(r)
    #---------------------------------------------------------------------
    DO  = 'POST'
    URL = '/Redirect.asp?UserID=xxx,xxx'

    headers = http.POST_Headers(SITE)
    headers['Referer'] = 'http://%s/Login.asp' % SITE
    headers['Cookie']  = 'ASPSESSIONIDSCBCSRCC=%s' % ASPSESSIONID

    r = http.Request(DO, URL, headers, post_data)
    site.request(r)
#------------------------------------------------------------------------
def scrape(postcode):
    DO  = 'POST'
    URL = '/Redirect.asp'

    print 'Scraping %d' % postcode

    headers = http.POST_Headers(SITE)
    headers['Referer'] = 'http://%s/Login.asp' % SITE
    headers['Cookie']  = 'ASPSESSIONIDSCBCSRCC=%s' % ASPSESSIONID

    post_data = {
        'txtCurrentUserID'      : 'xxx',
        'txtDC'                 : 'xxx',
        'txtCRN'                : '',
        'txtLastName'           : '',
        'cmbSearchTypeLastName' : '2',
        'txtAddress1'           : '',
        'cmbSearchTypeAddress1' : '2',
        'txtAddress2'           : '',
        'cmbSearchTypeAddress2' : '3',
        'txtSuburb'             : '',
        'cmbSearchTypeSuburb'   : '1',
        'txtPostcode'           : '%4d' % postcode,
        'cmbAction'             : '1',
        'imgSearch.x'           : '24',
        'imgSearch.y'           : '6'
    }

    r = http.Request(DO, URL, headers, post_data)
    site.request(r)

    return r.idx
#------------------------------------------------------------------------
def process():
    global site

    dir  = 'log'
    site = http.scraper.Scraper(SITE, log_dir=dir)

    login()

    data = ''
    ofh  = open('redirections.dat', 'a+')
    for postcode in postcodes:
        idx  = scrape(postcode)
        rows = http.parse('%s/%04d.html' % (dir, idx))
        for row in rows:
            ofh.write("%s|%s\n" % (row, postcode))
    ofh.close()
#------------------------------------------------------------------------
def main(args):
    process()
#-------------------------------------------------------------------------------
if __name__ == "__main__":
    main(sys.argv[1:])

"""
Date       Who Description
---------- --- -----------------------------------------------------
2008-02-20 plh Initial implementation
"""
http Module
__init__.py
from request import Request
import logger
import scraper
from parser import parse
#-------------------------------------------------------------------------------
# 'Accept' : 'text/plain, text/html',
def GET_Headers(site):
    return {
        'Accept-Encoding' : 'gzip, deflate',
        'Accept'          : '*/*',
        'Accept-Language' : 'en-au',
        'Host'            : '%s' % site,
        'Connection'      : 'Keep-Alive',
        'User-Agent'      : 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)'
    }
#-------------------------------------------------------------------------------
def POST_Headers(site):
    # Note: header names must not include a trailing ':' -- httplib adds it.
    return {
        'Accept'          : 'image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-shockwave-flash, */*',
        'Accept-Language' : 'en-au',
        'Content-Type'    : 'application/x-www-form-urlencoded',
        'Accept-Encoding' : 'gzip, deflate',
        'User-Agent'      : 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Host'            : '%s' % site,
        'Connection'      : 'Keep-Alive',
        'Cache-Control'   : 'no-cache',
    }
#-------------------------------------------------------------------------------
logger.py
#!/usr/bin/env python
#
#
#-------------------------------------------------------------------------------
import pprint
#-------------------------------------------------------------------------------
class Logger():

    directory = None
    #----------------------------------------------------------------------------
    def __init__(self, dir=None):
        print "[http::logger] dir %s" % dir
        if dir:
            self.directory = dir
        else:
            self.directory = '.'
    #----------------------------------------------------------------------------
    def log_request_params(self, idx, params):
        of = open('%s/%04d.req_params' % (self.directory, idx), 'w')
        of.write("%s\n" % pprint.pformat(params))
        of.close()
    #----------------------------------------------------------------------------
    def log_request_header(self, idx, hdr):
        of = open('%s/%04d.req_header' % (self.directory, idx), 'w')
        of.write("%s\n" % pprint.pformat(hdr))
        of.close()
    #----------------------------------------------------------------------------
    def log_request_data(self, idx, data):
        of = open('%s/%04d.req_data' % (self.directory, idx), 'w')
        of.write("%s\n" % pprint.pformat(data))
        of.close()
    #----------------------------------------------------------------------------
    def log_response_header(self, idx, resp):
        of = open('%s/%04d.resp_header' % (self.directory, idx), 'w')
        of.write("resp.__dict__ ->\n%s\n\n" % pprint.pformat(resp.__dict__))
        of.write("Status %s Reason [%s]\n" % (resp.status, resp.reason))
        of.write("Msg ->\n%s\n\n" % resp.msg)
        of.write("Msg.__dict__ ->\n%s\n\n" % pprint.pformat(resp.msg.__dict__))
        of.close()
    #----------------------------------------------------------------------------
    def log_response_body(self, idx, resp_body):
        of = open('%s/%04d.html' % (self.directory, idx), 'w')
        of.write(resp_body)
        of.close()
#-------------------------------------------------------------------------------
parser.py
The parser uses the BeautifulSoup module, which can be found here.
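Before the regex clean-up in the full listing below, the core BeautifulSoup 3 calls the parser relies on are just these; a minimal sketch, where '0010.html' is only an example log file name and the "fourth table holds the results" assumption mirrors what parse() does:

# Essential BeautifulSoup 3 calls used by parser.py below.
from BeautifulSoup import BeautifulSoup

soup   = BeautifulSoup(open('0010.html', 'r'))   # example log file written by the scraper
tables = soup.findAll('table')                   # all <table> elements in the page
rows   = tables[3].findAll('tr')                 # fourth table: the results grid (as parse() assumes)
for row in rows[4:]:                             # skip the header rows, as parse() does
    print str(row)                               # raw HTML for each result row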
#!/usr/bin/env python
import re
import sys
import pprint
#-------------------------------------------------------------------------------
from BeautifulSoup import BeautifulSoup
#-------------------------------------------------------------------------------
tr = 'tr>'

patterns = [
    [ ' height="20"', ''],
    [ ' class="TableValueBottomLeftBorder"', ''],
    [r'<img src="images\\spacer.gif" width="1" \/>', ''],
    [ '"frmRedirectionDetails"', ''],
    [ ' class="TableValueBottomLeftRightBorder"', ''],
    [r'<a href=\'javascript:OpenRedirectionDetails\("RedirectionDetails.asp\?RedirectionID=', ''],
    [r' <\/a>', ''],
    [r'", \)\'>', '|'],
    [r' *', ''],
    [r'<\/td><td>', '|'],
    [r'<\/td><\/tr>', ''],
    [r'<tr><td>', ''],
    [r',1,', '|'],
]
#-------------------------------------------------------------------------------
def parse(fname):
    data  = []
    comma = re.compile(',')

    # Pre-compile each pattern and stash the compiled regex as a third element.
    for p in patterns:
        p.append(re.compile(p[0]))

    doc  = open(fname, 'r')
    soup = BeautifulSoup(doc)
    # print len(soup('table', { "class" : "table_style"}))

    tables = soup.findAll('table')
    for i in range(len(tables)):
        if i == 3:
            trs = tables[i].findAll('tr')
            for j in range(len(trs)):
                if j > 3:
                    s = str(trs[j])
                    for p in patterns:
                        s = p[2].sub(p[1], s)
                    if re.search(tr, s):
                        continue
                    s = comma.sub('|', s, 1)
                    data.append(s)
    return data
#-------------------------------------------------------------------------------
def test():
    results = parse('0010.html')
    if results != None:
        print results
#-------------------------------------------------------------------------------
def main(args):
    test()
#-------------------------------------------------------------------------------
if __name__ == "__main__":
    main(sys.argv[1:])

"""
Date       Who Description
---------- --- -----------------------------------------------------
2008-02-20 plh Initial implementation
"""
request.py
#!/usr/bin/env python
#-------------------------------------------------------------------------------
class Request:

    Method           = 'GET'
    URL              = None
    Request_headers  = None
    Post_data        = None
    Request_params   = None
    Response_headers = None
    Response_body    = None
    #----------------------------------------------------------------------------
    def __init__(self, method, url, headers, post_data=None):
        self.Method          = method
        self.URL             = url
        self.Request_headers = headers
        self.Post_data       = post_data
#-------------------------------------------------------------------------------
scraper.py
#!/usr/bin/env python
#
#
#-------------------------------------------------------------------------------
import urllib
import httplib
#-------------------------------------------------------------------------------
from logger import Logger
#-------------------------------------------------------------------------------
class Scraper():

    idx        = None
    connection = None
    logger     = None

    def __init__(self, site, log_dir=None, protocol='http'):
        print "[http::scraper] log_dir %s" % log_dir
        if (protocol == 'https'):
            self.connection = httplib.HTTPSConnection(site)
        else:
            self.connection = httplib.HTTPConnection(site)
        self.idx    = 0
        self.logger = Logger(log_dir)
    #----------------------------------------------------------------------------
    def get_idx(self):
        return self.idx
    #----------------------------------------------------------------------------
    def request(self, r, debug=None):
        if debug:
            print '>>>> %s %s <<<<' % (r.Method, r.URL)

        if r.Post_data:
            r.Request_params = urllib.urlencode(r.Post_data)
            if (debug and (debug > 2)):
                print r.Request_params
            r.Request_headers['Content-Length'] = len(r.Request_params)

        self.connection.request(r.Method, r.URL, r.Request_params, r.Request_headers)
        resp = self.connection.getresponse()

        self.logger.log_request_header(self.idx, r.Request_headers)
        self.logger.log_response_header(self.idx, resp)

        r.Response_headers = resp
        r.Response_body    = resp.read()
        self.logger.log_response_body(self.idx, r.Response_body)
        if (debug and (debug > 2)):
            print r.Response_body

        r.idx     = self.idx
        self.idx += 1

        return r
#-------------------------------------------------------------------------------
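Putting the pieces together, a minimal driver for the http package might look like the sketch below; the host, path and log directory are placeholders, and the action script above shows the real usage.

#!/usr/bin/env python
# Minimal driver for the http package above.
# SITE, the path and the log directory are placeholders; 'log' must already exist,
# since Logger opens its output files directly under it.
import http

SITE = 'www.example.com'                        # hypothetical host
site = http.scraper.Scraper(SITE, log_dir='log')

headers = http.GET_Headers(SITE)
r = http.Request('GET', '/login.asp', headers)  # hypothetical path

site.request(r, debug=1)                        # logs headers and body under log/
print r.Response_headers.status                 # underlying httplib response object
print 'saved as log/%04d.html' % r.idx          # index assigned by the scraper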