Python - httplib

From PeformIQ Upgrade
Revision as of 15:01, 15 February 2008 by PeterHarding (talk | contribs)
Jump to navigation Jump to search

Examples

Whitepages

See the script that page-scrapes search results from the Whitepages site...

Parsing WhitePages Search Results HTML

File Download Example

#!/usr/bin/env python
#
#
#
#-------------------------------------------------------------------------------

import re
import sys
import urllib
import httplib
import binascii

#-------------------------------------------------------------------------------

# Host name of the portal server; used for the connection and the Host header.
SITE   = 'hx404'
# Path of the portal login page.
URL    = '/CustomerPortalWeb/login.portal'

# URL-encoded placeholder query string ('aaa=1').
# NOTE(review): neither URL nor params appears to be read by the code visible
# in this file (do() assigns its own locals with the same names) - confirm
# before removing.
params  = urllib.urlencode({'aaa' : 1})

# Browser identification string shared by both header sets below.
_USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)'

# Base headers for GET requests.
get_headers = {
   'Accept-Language': 'en-au',
   'Accept': 'text/plain',
   'Content-Type': 'text/html; charset=utf-8',
   'Connection': 'Keep-Alive',
   'Host': SITE,
   'User-Agent': _USER_AGENT,
}

# Base headers for form POST requests.
post_headers = {
   'Accept-Language': 'en-au',
   'Accept-Encoding': 'gzip, deflate',
   'Content-Type': 'application/x-www-form-urlencoded',
   'Host': SITE,
   'Connection': 'Keep-Alive',
   'Cache-Control': 'no-cache',
   'User-Agent': _USER_AGENT,
}

# Two further headers are not listed above because their values are only
# known at request time:
#    'Content-Length'   : len(request),
#    'Cookie'           : 'JSESSIONID=%s' % JSESSIONID

#-------------------------------------------------------------------------------

def log_header(idx, resp):
   """Dump the status and header details of a response to '<idx>.hdr'.

   idx  -- integer sequence number; zero-padded to four digits to form the
           file name (e.g. 1 -> '0001.hdr').
   resp -- response object exposing 'status', 'reason' and 'msg' attributes.

   The file is created in the current working directory and overwritten if
   it already exists.  Any exception raised while formatting or writing is
   propagated to the caller after the file has been closed.
   """
   of = open('%04d.hdr' % idx, 'w')

   # try/finally guarantees the handle is released even if one of the
   # attribute lookups or writes below fails part-way through.
   try:
      # Each argument is wrapped in a one-element tuple so that a value which
      # happens to be a tuple cannot be mistaken for the format argument list.
      of.write("resp.__dict__ -> '%s'\n" % (resp.__dict__,))
      of.write("Status %s  Reason [%s]\n" % (resp.status, resp.reason))
      of.write("Msg -> '%s'\n" % (resp.msg,))
      of.write("Msg.__dict__ -> '%s'\n" % (resp.msg.__dict__,))
   finally:
      of.close()

#-------------------------------------------------------------------------------

def log_body(idx, resp_body):
   """Write a raw response body to '<idx>.bdy'.

   idx       -- integer sequence number; zero-padded to four digits to form
                the file name (e.g. 2 -> '0002.bdy').
   resp_body -- the body as a byte string, exactly as read from the response.

   The file is created in the current working directory and overwritten if
   it already exists.
   """
   # Binary mode: the body may be compressed or otherwise non-text, and text
   # mode would rewrite line endings on platforms that translate them.
   of = open('%04d.bdy' % idx, 'wb')

   # try/finally so the handle is released even if the write fails.
   try:
      of.write(resp_body)
   finally:
      of.close()

#-------------------------------------------------------------------------------

def _exchange(conn, idx, method, url, body, headers):
   """Send one request on conn, log the reply under sequence number idx and
   return the pair (resp, resp_body)."""
   conn.request(method, url, body, headers)

   resp = conn.getresponse()

   log_header(idx, resp)

   # The body is always read in full here so that the kept-alive connection
   # is ready to carry the next request.
   resp_body = resp.read()

   log_body(idx, resp_body)

   return resp, resp_body


def _session_id(resp):
   """Return the JSESSIONID value from the Set-Cookie header of resp, or
   None when the header or the cookie is missing."""
   cookie = resp.getheader('set-cookie')

   if not cookie:
      return None

   # Stop at the first ';', ',' or whitespace so that cookie attributes
   # (path, secure, ...) and any following cookies are not captured.
   m = re.search(r'JSESSIONID=([^;,\s]+)', cookie)

   if m:
      return m.group(1)

   return None


def do(username='cpcustomeradmin', password='August2007'):
   """Log in to the customer portal on SITE and fetch two portal pages.

   Steps, each logged through log_header()/log_body() under its own
   sequence number:

      1. GET  the login page and pick up the JSESSIONID cookie.
      2. POST the login form with the given credentials.
      3. GET  /CustomerPortalWeb/ausPost.portal.
      4. GET  the same page with _nfpb=true&_pageLabel=ImportUserPage.

   username, password -- values submitted in the login form; the defaults are
                         the account originally hard-coded in this script.

   Raises RuntimeError if the login page response carries no JSESSIONID
   cookie.  The connection is closed on every exit path.
   """
   # NOTE(review): the default credentials live in source control - consider
   # supplying them from outside the script instead.

   # Work on copies so the module-level header templates are not mutated and
   # a Cookie from one run cannot leak into the first request of the next.
   get_hdrs  = dict(get_headers)
   post_hdrs = dict(post_headers)

   conn = httplib.HTTPConnection(SITE)

   # Single-argument print(...) is used throughout; under the print statement
   # it is a parenthesised expression and produces identical output.
   try:
      #---------------------------------------------------------------------

      print(">>>>> GET /CustomerPortalWeb/login.portal <<<<<")

      resp, resp_body = _exchange(conn, 1, "GET", '/CustomerPortalWeb/login.portal', None, get_hdrs)

      JSESSIONID = _session_id(resp)

      if JSESSIONID:
         print(JSESSIONID)

      print(resp_body)

      if not JSESSIONID:
         raise RuntimeError("no JSESSIONID cookie in login page response from %s" % SITE)

      #---------------------------------------------------------------------

      print(">>>>> POST /CustomerPortalWeb/login/login.do <<<<<")
      print(">>>>> JSESSIONID = %s " % JSESSIONID)

      # The login form's action carries the session id in the path, e.g.
      # <form action="http://hx404:80/CustomerPortalWeb/login/login.do;jsessionid=vgp9...!1111094026" method="post">
      login_url = "/CustomerPortalWeb/login/login.do;jsessionid=%s" % JSESSIONID

      form_body = urllib.urlencode({
         '{actionForm.username}' : username,
         '{actionForm.password}' : password
      })

      post_hdrs['Content-Length'] = len(form_body)
      post_hdrs['Cookie']         = 'JSESSIONID=%s' % JSESSIONID

      _exchange(conn, 2, "POST", login_url, form_body, post_hdrs)

      #---------------------------------------------------------------------

      print(">>>>> GET /CustomerPortalWeb/ausPost.portal <<<<<")

      # The session cookie is sent on both remaining GET requests.
      get_hdrs['Cookie'] = 'JSESSIONID=%s' % JSESSIONID

      _exchange(conn, 3, "GET", '/CustomerPortalWeb/ausPost.portal', None, get_hdrs)

      #---------------------------------------------------------------------

      ue_args = urllib.urlencode({
         '_nfpb'      : 'true',
         '_pageLabel' : 'ImportUserPage'
      })

      print(">>>>> GET /CustomerPortalWeb/ausPost.portal <<<<<")

      _exchange(conn, 4, "GET", '/CustomerPortalWeb/ausPost.portal?%s' % ue_args, None, get_hdrs)

      #---------------------------------------------------------------------
   finally:
      conn.close()

#-------------------------------------------------------------------------------

def main(args):
   """Script entry point: run the portal session via do().

   args -- list of command-line arguments; accepted so the function has a
           conventional entry-point signature, but not used.
   """
   do()

#-------------------------------------------------------------------------------

# Run only when executed as a script; main() receives the command-line
# arguments with the program name stripped.
if __name__ == "__main__":
   main(sys.argv[1:])

#-------------------------------------------------------------------------------

"""
Regex Stuff:
    regex          = re.compile("\\n *")
       (name, cnt) = re.subn('esb:', '', node_name)
             value = re.sub(r'\n *', 'N/A', value)
"""

"""
  FILE DOWNLOAD:

    h.putrequest('POST', '/scripts/cgi.exe?')
    h.putheader('Content-length', '%d'%len(params))
    h.putheader('Accept', 'text/plain')
    h.putheader('Host', 'test.site.com')
    h.endheaders()
    h.send(params)
    reply, msg, hdrs = h.getreply()
    data = h.getfile().read()
    file('test.file', 'w').write(data)
    h.close()
"""

"""
Accept-Language: en-au
Content-Type: application/x-www-form-urlencoded
Accept-Encoding: gzip, deflate
User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)
Host: hx414:6304
Content-Length: 54
Connection: Keep-Alive
Cache-Control: no-cache
Cookie: JSESSIONID=jpm7G5hJbx6pYdhTr3GRRQrXsknrFcxdF7VdhcVPctThHdQxJjsC!2061771890
"""