Python - httplib

__TOC__
=Examples=
==Whitepages==
See the following script, which scrapes search results off the Whitepages site:

[[Parsing WhitePages Search Results HTML]]

==File Download Example==


<pre>
#!/usr/bin/env python
#
#
#
#-------------------------------------------------------------------------------
import re
import sys
import urllib
import httplib
import binascii
#-------------------------------------------------------------------------------
SITE    = 'host'
URL     = '/url/login'
params  = urllib.urlencode({'aaa' : 1})
get_headers = {
  'Accept-Language'  : 'en-au',
  'Accept'          : 'text/plain',
  'Content-Type'    : 'text/html; charset=utf-8',
  'Connection'      : 'Keep-Alive',
  'Host'            : SITE,
  'User-Agent'      : 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)'
}
post_headers = {
  'Accept-Language'  : 'en-au',
  'Accept-Encoding'  : 'gzip, deflate',
  'Content-Type'    : 'application/x-www-form-urlencoded',
  'Host'            : SITE,
  'Connection'      : 'Keep-Alive',
  'Cache-Control'    : 'no-cache',
  'User-Agent'      : 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)'
}
  # 'Content-Length'  : len(request),
  # 'Cookie'          : 'JSESSIONID=%s' % JSESSIONID
#-------------------------------------------------------------------------------
def log_header(idx, resp):
  of = open('%04d.hdr' % idx, 'w')
  of.write("resp.__dict__ -> '%s'\n" % resp.__dict__)
  of.write("Status %s  Reason [%s]\n" % (resp.status, resp.reason))
  of.write("Msg -> '%s'\n" % resp.msg)
  of.write("Msg.__dict__ -> '%s'\n" % resp.msg.__dict__)
  #xxx =  "Msg.__dict__ -> '%s'" % resp.msg.__dict__['dict']['set-cookie']
  #print xxx
  of.close()
#-------------------------------------------------------------------------------
def log_body(idx, resp_body):
  of = open('%04d.bdy' % idx, 'w')
  of.write(resp_body)
  of.close()
#-------------------------------------------------------------------------------
def do():
  conn = httplib.HTTPConnection(SITE)
  #---------------------------------------------------------------------
  idx = 1
  JSESSIONID = None   # extracted from the Set-Cookie response header below
  print ">>>>> GET %s <<<<<" % URL
  conn.request("GET", '/%s/login.do' % URL, None, get_headers)
  resp = conn.getresponse()
  log_header(idx, resp)
  # pull the session id out of the Set-Cookie header (the mimetools.Message
  # in resp.msg keeps the parsed headers in its 'dict' attribute)
  m = re.search('JSESSIONID=(.*);', resp.msg.__dict__['dict']['set-cookie'])
  if m:
      print m.group(1)
      JSESSIONID = m.group(1)
  resp_body = resp.read()
  log_body(idx, resp_body)
  print resp_body
  # <form action="http://xxxx:80/CustomerPortalWeb/login/login.do;jsessionid=vgp9GDVS6JyTly0v6NfsHG0rt1pLyvpMLxYnJf9MXsk3Yn0T2SZ3!1111094026" method="post">
  #---------------------------------------------------------------------
  idx = 2
  print ">>>>> POST /%s/login.do <<<<<" % URL
  print ">>>>> JSESSIONID = %s " % JSESSIONID
  URL = "/CustomerPortalWeb/login/login.do;jsessionid=%s" % JSESSIONID
#  form_data = {
#      '{actionForm.username}' : 'svtest035@svt',
#      '{actionForm.password}' : 'xxxx'
#  }
  form_data = {
      '{actionForm.username}' : 'admin',
      '{actionForm.password}' : 'xxxx'
  }
  params = urllib.urlencode(form_data)
  post_headers['Content-Length'] = len(params)
  post_headers['Cookie']        = 'JSESSIONID=%s' % JSESSIONID
  conn.request("POST", URL, params, post_headers)
  resp = conn.getresponse()
  log_header(idx, resp)
  resp_body = resp.read()
  log_body(idx, resp_body)
  #---------------------------------------------------------------------
  idx = 3
  print ">>>>> GET /%s/test.do <<<<<" % URL
  get_headers['Cookie']        = 'JSESSIONID=%s' % JSESSIONID
  conn.request("GET", '/%s/test.do' % URL, None, get_headers)
  resp = conn.getresponse()
  log_header(idx, resp)
  resp_body = resp.read()
  log_body(idx, resp_body)
  #---------------------------------------------------------------------
  idx = 4
  args    = {
                '_nfpb'      : 'true',
                '_pageLabel' : 'ImportUserPage'
            }
  ue_args = urllib.urlencode(args)
  print ">>>>> GET /%s/test.do <<<<<" % URL
  get_headers['Cookie']        = 'JSESSIONID=%s' % JSESSIONID
  conn.request("GET", '/%s/test.do?%s' % (URL, ue_args), None, get_headers)
  resp = conn.getresponse()
  log_header(idx, resp)
  resp_body = resp.read()
  log_body(idx, resp_body)
  #---------------------------------------------------------------------
  conn.close()
#-------------------------------------------------------------------------------
def main(args):
  do()
#-------------------------------------------------------------------------------
if __name__ == "__main__":
  main(sys.argv[1:])
#-------------------------------------------------------------------------------
"""
Regex Stuff:
    regex          = re.compile("\\n *")
      (name, cnt) = re.subn('esb:', '', node_name)
            value = re.sub(r'\n *', 'N/A', value)
"""
"""
  FILE DOWNLOAD:


    h.putrequest('POST', '/scripts/cgi.exe?')
    h.putheader('Content-length', '%d'%len(params))
    h.putheader('Accept', 'text/plain')
    h.putheader('Host', 'test.site.com')
    h.endheaders()
    h.send(params)
    reply, msg, hdrs = h.getreply()
    data = h.getfile().read()
    file('test.file', 'w').write(data)
    h.close()
"""
"""
Accept-Language: en-au
Content-Type: application/x-www-form-urlencoded
Accept-Encoding: gzip, deflate
User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)
Host: xxx:8080
Content-Length: 54
Connection: Keep-Alive
Cache-Control: no-cache
Cookie: JSESSIONID=xxx
"""
</pre>
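The <code>FILE DOWNLOAD</code> docstring above uses the legacy <code>httplib.HTTP</code> class (<code>putrequest</code>/<code>getreply</code>/<code>getfile</code>). A minimal sketch of the same download rewritten against the <code>HTTPConnection</code> API used in the main example (the host and CGI path are placeholders carried over from the docstring):

<pre>
#!/usr/bin/env python
#
# Sketch only: re-implements the legacy httplib.HTTP download above
# with httplib.HTTPConnection.
#-------------------------------------------------------------------------------
import urllib
import httplib
#-------------------------------------------------------------------------------
params = urllib.urlencode({'aaa' : 1})

conn = httplib.HTTPConnection('test.site.com')

headers = {
  'Accept'        : 'text/plain',
  'Content-Type'  : 'application/x-www-form-urlencoded'
}

# Host and Content-Length are filled in by HTTPConnection itself
conn.request('POST', '/scripts/cgi.exe', params, headers)

resp = conn.getresponse()     # replaces getreply()
data = resp.read()            # replaces getfile().read()

of = open('test.file', 'wb')  # binary mode, in case the file is not text
of.write(data)
of.close()

conn.close()
</pre>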
=Modular=
See [http://www.performiq.com.au/kb/images/Action.zip here] for a zipped-up copy of the files.
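The layout of the package is as follows (the name of the top-level script is an assumption; the modules themselves are listed below):

<pre>
action.py          # driver: log in, then scrape each postcode
http/
    __init__.py    # GET_Headers/POST_Headers; re-exports Request, scraper, parse
    logger.py      # writes numbered request/response log files
    parser.py      # BeautifulSoup parser for the logged result pages
    request.py     # Request: a simple request/response container
    scraper.py     # Scraper: httplib connection wrapper with logging
</pre>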
==action==
<pre>
#!/usr/bin/env python
#
#
#-------------------------------------------------------------------------------
import re
import sys
#-------------------------------------------------------------------------------
import http   # the local http package described below
#-------------------------------------------------------------------------------
SITE        = 'host'
site        = None
ASPSESSIONID = None
postcodes    = [2000, 2001, 2007, 2010, 3000, 3001, 3162, 3124, 3005, 6009, 7001]
#-------------------------------------------------------------------------------
def login():
  global ASPSESSIONID
  #------------------------------------------------------------------------
  DO  = 'GET'
  URL = '/'
  headers = http.GET_Headers(SITE)
  r = http.Request(DO, URL, headers)
  site.request(r)
  #------------------------------------------------------------------------
  DO  = 'GET'
  URL = '/login.asp'
  headers = http.GET_Headers(SITE)
  headers['Referer']        = 'http://%s/' % SITE
  r = http.Request(DO, URL, headers)
  site.request(r)
  m = re.search('ASPSESSIONIDSCBCSRCC=(.*);', r.Response_headers.msg.__dict__['dict']['set-cookie'])
  if m:
      print "ASPSESSIONID --> %s" % m.group(1)
      ASPSESSIONID = m.group(1)
  #---------------------------------------------------------------------
  DO  = 'POST'
  URL = '/Login.asp'
  headers = http.POST_Headers(SITE)
  headers['Referer']        = 'http://%s/login.asp' % SITE
  headers['Cookie']        = 'ASPSESSIONIDSCBCSRCC=%s' % ASPSESSIONID
  post_data = {
      'txtUserName' : 'xxx',
      'txtPassword' : 'xxx',
      'txtUserID'  : '0',
      'imgOK.x'    : '32',
      'imgOK.y'    : '11'
  }
  r = http.Request(DO, URL, headers, post_data)
  site.request(r)
  #---------------------------------------------------------------------
  DO  = 'POST'
  URL = '/Login.asp'
  headers = http.POST_Headers(SITE)
  headers['Referer']        = 'http://%s/login.asp' % SITE
  headers['Cookie']        = 'ASPSESSIONIDSCBCSRCC=%s' % ASPSESSIONID
  form_data = {
      'txtUserID'  : '1142',
      'imgOK.x'    : '31',
      'imgOK.y'    : '12'
  }
  r = http.Request(DO, URL, headers, form_data)
  site.request(r)
  #---------------------------------------------------------------------
  DO  = 'POST'
  URL = '/Login.asp'
  headers = http.POST_Headers(SITE)
  headers['Referer']        = 'http://%s/Login.asp' % SITE
  headers['Cookie']        = 'ASPSESSIONIDSCBCSRCC=%s' % ASPSESSIONID
  form_data = {
      'cmbDC'      : '311001%2CMETRO MAILS BUS UNIT CAP INV  ',
      'txtUserID'  : '1142',
      'imgOK2.x'    : '20',
      'imgOK2.y'    : '5'
  }
  r = http.Request(DO, URL, headers, form_data)
  site.request(r)
  #---------------------------------------------------------------------
  DO  = 'GET'
  URL = '/menuindex.asp?UserID=xxx,xxx'
  headers = http.GET_Headers(SITE)
  headers['Cookie']        = 'ASPSESSIONIDSCBCSRCC=%s' % ASPSESSIONID
  r = http.Request(DO, URL, headers)
  site.request(r)
 
  #---------------------------------------------------------------------
  DO  = 'POST'
  URL = '/Redirect.asp?UserID=xxx,xxx'
  headers = http.POST_Headers(SITE)
  headers['Referer']        = 'http://%s/Login.asp' % SITE
  headers['Cookie']        = 'ASPSESSIONIDSCBCSRCC=%s' % ASPSESSIONID
  r = http.Request(DO, URL, headers, post_data)
  site.request(r)
#------------------------------------------------------------------------
def scrape(postcode):
  DO  = 'POST'
  URL = '/Redirect.asp'
  print 'Scraping %d' % postcode
  headers = http.POST_Headers(SITE)
  headers['Referer']        = 'http://%s/Login.asp' % SITE
  headers['Cookie']        = 'ASPSESSIONIDSCBCSRCC=%s' % ASPSESSIONID
  post_data = {
      'txtCurrentUserID'      : 'xxx',
      'txtDC'                : 'xxx',
      'txtCRN'                : '',
      'txtLastName'          : '',
      'cmbSearchTypeLastName' : '2',
      'txtAddress1'          : '',
      'cmbSearchTypeAddress1' : '2',
      'txtAddress2'          : '',
      'cmbSearchTypeAddress2' : '3',
      'txtSuburb'            : '',
      'cmbSearchTypeSuburb'  : '1',
      'txtPostcode'          : '%4d' % postcode,
      'cmbAction'            : '1',
      'imgSearch.x'          : '24',
      'imgSearch.y'          : '6'
  }
  r = http.Request(DO, URL, headers, post_data)
  site.request(r)
  return r.idx
#------------------------------------------------------------------------
def process():
  global site
  dir = 'log'   # must already exist; Logger does not create it
  site = http.scraper.Scraper(SITE, log_dir=dir)
  login()
  data = ''
  ofh = open('redirections.dat', 'a+')
  for postcode in postcodes:
      idx = scrape(postcode)
      rows = http.parse('%s/%04d.html' % (dir, idx))
      for row in rows:
        ofh.write("%s|%s\n" % (row, postcode))
  ofh.close()
#------------------------------------------------------------------------
def main(args):
  process()
#-------------------------------------------------------------------------------
if __name__ == "__main__":
  main(sys.argv[1:])
"""
    Date    Who  Description
  ----------  ---  -----------------------------------------------------
  2008-02-20  plh  Initial implementation
"""
</pre>
==http Module==
===__init__.py===
<pre>
from request import Request
import logger
import scraper
from parser import parse
#-------------------------------------------------------------------------------
#    'Accept' : 'text/plain, text/html',
def GET_Headers(site):
  return {
      'Accept-Encoding'    : 'gzip, deflate',
      'Accept'            : '*/*',
      'Accept-Language'    : 'en-au',
      'Host'              : '%s' % site,
      'Connection'        : 'Keep-Alive',
      'User-Agent'        : 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)'
  }
#-------------------------------------------------------------------------------
def POST_Headers(site):
  # note: header names must not include the trailing colon
  return  {
      'Accept'            : 'image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-shockwave-flash, */*',
      'Accept-Language'   : 'en-au',
      'Content-Type'      : 'application/x-www-form-urlencoded',
      'Accept-Encoding'   : 'gzip, deflate',
      'User-Agent'        : 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
      'Host'              : '%s' % site,
      'Connection'        : 'Keep-Alive',
      'Cache-Control'     : 'no-cache',
  }
#-------------------------------------------------------------------------------
</pre>
===logger.py===
<pre>
#!/usr/bin/env python
#
#
#-------------------------------------------------------------------------------
import pprint
#-------------------------------------------------------------------------------
class Logger():
  directory = None
  #----------------------------------------------------------------------------
  def __init__(self, dir=None):
      print "[http::scraper]  dir %s" % dir
      if dir:
        self.directory = dir
      else:
        self.directory = '.'
  #----------------------------------------------------------------------------
  def log_request_params(self, idx, params):
      of = open('%s/%04d.req_params' % (self.directory, idx), 'w')
      of.write("%s\n" % pprint.pformat(params))
      of.close()
  #----------------------------------------------------------------------------
  def log_request_header(self, idx, hdr):
      of = open('%s/%04d.req_header' % (self.directory, idx), 'w')
      of.write("%s\n" % pprint.pformat(hdr))
      of.close()
  #----------------------------------------------------------------------------
  def log_request_data(self, idx, data):
      of = open('%s/%04d.req_data' % (self.directory, idx), 'w')
      of.write("%s\n" % pprint.pformat(data))
      of.close()
  #----------------------------------------------------------------------------
  def log_response_header(self, idx, resp):
      of = open('%s/%04d.resp_header' % (self.directory, idx), 'w')
      of.write("resp.__dict__ ->\n%s\n\n" % pprint.pformat(resp.__dict__))
      of.write("Status %s  Reason [%s]\n" % (resp.status, resp.reason))
      of.write("Msg ->\n%s\n\n" % resp.msg)
      of.write("Msg.__dict__ ->\n%s\n\n" % pprint.pformat(resp.msg.__dict__))
      of.close()
  #----------------------------------------------------------------------------
  def log_response_body(self, idx, resp_body):
      of = open('%s/%04d.html' % (self.directory, idx), 'w')
      of.write(resp_body)
      of.close()
#-------------------------------------------------------------------------------
</pre>
===parser.py===
The parser uses the BeautifulSoup module, which can be found [http://www.crummy.com/software/BeautifulSoup/#Download here].
<pre>
#!/usr/bin/env python
import re
import sys
import pprint
#-------------------------------------------------------------------------------
from BeautifulSoup import BeautifulSoup
#-------------------------------------------------------------------------------
tr = 'tr>'   # rows still containing markup after substitution are skipped
patterns = [
    [ ' height="20"', ''],
    [ ' class="TableValueBottomLeftBorder"', ''],
    [r'<img src="images\\spacer.gif" width="1" \/>', ''],
    [ '"frmRedirectionDetails"', ''],
    [ ' class="TableValueBottomLeftRightBorder"', ''],
    [r'<a href=\'javascript:OpenRedirectionDetails\("RedirectionDetails.asp\?RedirectionID=', ''],
    [r' <\/a>', ''],
    [r'", \)\'>', '|'],
    [r'  *', ''],
    [r'<\/td><td>', '|'],
    [r'<\/td><\/tr>', ''],
    [r'<tr><td>', ''],
    [r',1,', '|'],
]
#-------------------------------------------------------------------------------
def parse(fname):
  data = []
  comma = re.compile(',')
  for p in patterns:
      if len(p) == 2:   # compile once; parse() may be called more than once
        p.append(re.compile(p[0]))
  doc = open(fname, 'r')
  soup = BeautifulSoup(doc)
  # print len(soup('table', { "class" : "table_style"}))
  tables = soup.findAll('table')
  for i in range(len(tables)):
      if i == 3:        # the fourth table holds the search results
        trs = tables[i].findAll('tr')
        for j in range(len(trs)):
            if j > 3:   # skip the heading rows
              s = str(trs[j])
              for p in patterns:
                  s = p[2].sub(p[1], s)
              if re.search(tr, s):
                  continue
             
              s = comma.sub('|', s, 1)
              data.append(s)
  return data
#-------------------------------------------------------------------------------
def test():
  results = parse('0010.html')
  if results is not None:
      print results
#-------------------------------------------------------------------------------
def main(args):
  test()
#-------------------------------------------------------------------------------
if __name__ == "__main__":
  main(sys.argv[1:])
"""
    Date    Who  Description
  ----------  ---  -----------------------------------------------------
  2008-02-20  plh  Initial implementation
"""
</pre>
===request.py===
<pre>
#!/usr/bin/env python
#-------------------------------------------------------------------------------
class Request:
  Method          = 'GET'
  URL              = None
  Request_headers  = None
  Post_data        = None
  Request_params  = None
  Response_headers = None
  Response_body    = None
  #----------------------------------------------------------------------------
  def __init__(self, method, url, headers, post_data=None):
      self.Method            = method
      self.URL                = url
      self.Request_headers    = headers
      self.Post_data          = post_data
#-------------------------------------------------------------------------------
</pre>
===scraper.py===
<pre>
#!/usr/bin/env python
#
#
#-------------------------------------------------------------------------------
import urllib
import httplib
#-------------------------------------------------------------------------------
from logger import Logger
#-------------------------------------------------------------------------------
class Scraper():
  idx        = None
  connection = None
  logger    = None
  def __init__(self, site, log_dir=None, protocol='http'):
      print "[http::scraper]  log_dir %s" % log_dir
      if (protocol == 'https'):
        self.connection = httplib.HTTPSConnection(site)
      else:
        self.connection = httplib.HTTPConnection(site)
      self.idx          = 0
      self.logger        = Logger(log_dir)
  #----------------------------------------------------------------------------
  def get_idx(self):
      return self.idx
  #----------------------------------------------------------------------------
  def request(self, r, debug=None):
      # send r, log the exchange under the current index, and return r with
      # Response_headers, Response_body and r.idx filled in
      if debug: print '>>>> %s %s <<<<' % (r.Method, r.URL)
      if r.Post_data:
        r.Request_params = urllib.urlencode(r.Post_data)
        if (debug and (debug > 2)): print r.Request_params
        r.Request_headers['Content-Length'] = len(r.Request_params)
      self.connection.request(r.Method, r.URL, r.Request_params, r.Request_headers)
      resp = self.connection.getresponse()
      self.logger.log_request_header(self.idx, r.Request_headers)
      self.logger.log_response_header(self.idx, resp)
      r.Response_headers = resp
      r.Response_body = resp.read()
      self.logger.log_response_body(self.idx, r.Response_body)
      if (debug and (debug > 2)): print r.Response_body
      r.idx    = self.idx
      self.idx += 1
      return r
#-------------------------------------------------------------------------------
</pre>
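Putting the modules together, a minimal driver looks like this (a sketch distilled from the action script above; 'host' is a placeholder):

<pre>
#!/usr/bin/env python
#
# Sketch: fetch one page through the http package and parse the logged copy.
#-------------------------------------------------------------------------------
import http
#-------------------------------------------------------------------------------
SITE = 'host'

site = http.scraper.Scraper(SITE, log_dir='log')   # 'log' must already exist

headers = http.GET_Headers(SITE)
r = site.request(http.Request('GET', '/', headers))

print "status %s, logged as %04d.html" % (r.Response_headers.status, r.idx)

# parse the response body that Scraper logged to disk
rows = http.parse('log/%04d.html' % r.idx)
</pre>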
=BeautifulSoup=
* http://www.crummy.com/software/BeautifulSoup/#Download
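For reference, the calls the parser relies on (a minimal sketch against the BeautifulSoup 3 API):

<pre>
from BeautifulSoup import BeautifulSoup

soup   = BeautifulSoup(open('0010.html'))
tables = soup.findAll('table')      # all <table> elements, in document order
rows   = tables[3].findAll('tr')    # rows of the results table, as in parser.py
print str(rows[4])                  # a tag renders back to markup via str()
</pre>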
[[Category:Python]]
[[Category:Python httplib]]
