Python - httplib
Examples
Whitepages
See the script that page-scrapes search results off the Whitepages site:
Parsing WhitePages Search Results HTML
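All of the examples on this page follow the same basic httplib request/response cycle. A minimal sketch (Python 2; the host and path are placeholders):

#!/usr/bin/env python
# Minimal httplib GET; host and path are placeholders.
import httplib

conn = httplib.HTTPConnection('www.example.com')
conn.request('GET', '/')
resp = conn.getresponse()
print resp.status, resp.reason     # e.g. 200 OK
body = resp.read()                 # read the body before reusing the connection
conn.close()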
File Download Example
#!/usr/bin/env python
#
#
#
#-------------------------------------------------------------------------------
import re
import sys
import urllib
import httplib
import binascii
#-------------------------------------------------------------------------------
SITE = 'host'
URL = '/url/login'
params = urllib.urlencode({'aaa' : 1})
get_headers = {
'Accept-Language' : 'en-au',
'Accept' : 'text/plain',
'Content-Type' : 'text/html; charset=utf-8',
'Connection' : 'Keep-Alive',
'Host' : SITE,
'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)'
}
post_headers = {
'Accept-Language' : 'en-au',
'Accept-Encoding' : 'gzip, deflate',
'Content-Type' : 'application/x-www-form-urlencoded',
'Host' : SITE,
'Connection' : 'Keep-Alive',
'Cache-Control' : 'no-cache',
'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)'
}
# 'Content-Length' : len(request),
# 'Cookie' : 'JSESSIONID=%s' % JSESSIONID
#-------------------------------------------------------------------------------
def log_header(idx, resp):
of = open('%04d.hdr' % idx, 'w')
of.write("resp.__dict__ -> '%s'\n" % resp.__dict__)
of.write("Status %s Reason [%s]\n" % (resp.status, resp.reason))
of.write("Msg -> '%s'\n" % resp.msg)
of.write("Msg.__dict__ -> '%s'\n" % resp.msg.__dict__)
#xxx = "Msg.__dict__ -> '%s'" % resp.msg.__dict__['dict']['set-cookie']
#print xxx
of.close()
#-------------------------------------------------------------------------------
def log_body(idx, resp_body):
of = open('%04d.bdy' % idx, 'w')
of.write(resp_body);
of.close()
#-------------------------------------------------------------------------------
def do():
conn = httplib.HTTPConnection(SITE)
#---------------------------------------------------------------------
idx = 1
print ">>>>> GET %s <<<<<" % URL
conn.request("GET", '/%s/login.do' % URL, None, get_headers)
resp = conn.getresponse()
log_header(idx, resp)
    m = re.search('JSESSIONID=([^;]+)', resp.getheader('set-cookie'))   # grab just the session id
if m:
print m.group(1)
JSESSIONID = m.group(1)
resp_body = resp.read()
log_body(idx, resp_body)
print resp_body
# <form action="http://xxxx:80/CustomerPortalWeb/login/login.do;jsessionid=vgp9GDVS6JyTly0v6NfsHG0rt1pLyvpMLxYnJf9MXsk3Yn0T2SZ3!1111094026" method="post">
#---------------------------------------------------------------------
idx = 2
print ">>>>> POST /%s/login.do <<<<<" % URL
print ">>>>> JSESSIONID = %s " % JSESSIONID
    post_url = "/CustomerPortalWeb/login/login.do;jsessionid=%s" % JSESSIONID   # local name; reassigning URL would shadow the module-level value used above
# form_data = {
# '{actionForm.username}' : 'svtest035@svt',
# '{actionForm.password}' : 'xxxx'
# }
form_data = {
'{actionForm.username}' : 'admin',
'{actionForm.password}' : 'xxxx'
}
params = urllib.urlencode(form_data)
post_headers['Content-Length'] = len(params)
post_headers['Cookie'] = 'JSESSIONID=%s' % JSESSIONID
    conn.request("POST", post_url, params, post_headers)
resp = conn.getresponse()
log_header(idx, resp)
resp_body = resp.read()
log_body(idx, resp_body)
#---------------------------------------------------------------------
idx = 3
print ">>>>> GET /%s/test.do <<<<<" % URL
get_headers['Cookie'] = 'JSESSIONID=%s' % JSESSIONID
conn.request("GET", '/%s/test.do' % URL, None, get_headers)
resp = conn.getresponse()
log_header(idx, resp)
resp_body = resp.read()
log_body(idx, resp_body)
#---------------------------------------------------------------------
idx = 4
args = {
'_nfpb' : 'true',
'_pageLabel' : 'ImportUserPage'
}
ue_args = urllib.urlencode(args)
print ">>>>> GET /%s/test.do <<<<<" % URL
get_headers['Cookie'] = 'JSESSIONID=%s' % JSESSIONID
conn.request("GET", '/%s/test.do?%s' % (URL, ue_args), None, get_headers)
resp = conn.getresponse()
log_header(idx, resp)
resp_body = resp.read()
log_body(idx, resp_body)
#---------------------------------------------------------------------
conn.close()
#-------------------------------------------------------------------------------
def main(args):
do()
#-------------------------------------------------------------------------------
if __name__ == "__main__":
main(sys.argv[1:])
#-------------------------------------------------------------------------------
"""
Regex Stuff:
regex = re.compile("\\n *")
(name, cnt) = re.subn('esb:', '', node_name)
value = re.sub(r'\n *', 'N/A', value)
"""
"""
FILE DOWNLOAD:
h.putrequest('POST', '/scripts/cgi.exe?')
h.putheader('Content-length', '%d'%len(params))
h.putheader('Accept', 'text/plain')
h.putheader('Host', 'test.site.com')
h.endheaders()
h.send(params)
reply, msg, hdrs = h.getreply()
data = h.getfile().read()
file('test.file', 'w').write(data)
h.close()
"""
"""
Accept-Language: en-au
Content-Type: application/x-www-form-urlencoded
Accept-Encoding: gzip, deflate
User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)
Host: xxx:8080
Content-Length: 54
Connection: Keep-Alive
Cache-Control: no-cache
Cookie: JSESSIONID=xxx
"""
Modular
See here for a zipped-up copy of the files.
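The package exposes header helpers (GET_Headers / POST_Headers), a Request object that carries headers and form data, a Scraper that drives the httplib connection and logs each exchange, and a parse function for the result pages. Condensed usage, as the action script below does at length (a sketch; the host, path and log directory are placeholders, and the log directory must already exist):

import http

site = http.scraper.Scraper('www.example.com', log_dir='log')
headers = http.GET_Headers('www.example.com')
r = http.Request('GET', '/login.asp', headers)
r = site.request(r)                          # logs headers/body to log/0000.*
print r.Response_headers.status
rows = http.parse('log/%04d.html' % r.idx)   # scrape the saved page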
action
#!/usr/bin/env python
#
#
#-------------------------------------------------------------------------------
import re
import sys
#-------------------------------------------------------------------------------
import http
#-------------------------------------------------------------------------------
SITE = 'host'
site = None
ASPSESSIONID = None
postcodes = [2000, 2001, 2007, 2010, 3000, 3001, 3162, 3124, 3005, 6009, 7001]
#-------------------------------------------------------------------------------
def login():
global ASPSESSIONID
#------------------------------------------------------------------------
DO = 'GET'
URL = '/'
headers = http.GET_Headers(SITE)
r = http.Request(DO, URL, headers)
site.request(r)
#------------------------------------------------------------------------
DO = 'GET'
URL = '/login.asp'
headers = http.GET_Headers(SITE)
headers['Referer'] = 'http://%s/' % SITE
r = http.Request(DO, URL, headers)
site.request(r)
    m = re.search('ASPSESSIONIDSCBCSRCC=([^;]+)', r.Response_headers.getheader('set-cookie'))
if m:
print "ASPSESSIONID --> %s" % m.group(1)
ASPSESSIONID = m.group(1)
#---------------------------------------------------------------------
DO = 'POST'
URL = '/Login.asp'
headers = http.POST_Headers(SITE)
headers['Referer'] = 'http://%s/login.asp' % SITE
headers['Cookie'] = 'ASPSESSIONIDSCBCSRCC=%s' % ASPSESSIONID
post_data = {
'txtUserName' : 'xxx',
'txtPassword' : 'xxx',
'txtUserID' : '0',
'imgOK.x' : '32',
'imgOK.y' : '11'
}
r = http.Request(DO, URL, headers, post_data)
site.request(r)
#---------------------------------------------------------------------
DO = 'POST'
URL = '/Login.asp'
headers = http.POST_Headers(SITE)
headers['Referer'] = 'http://%s/login.asp' % SITE
headers['Cookie'] = 'ASPSESSIONIDSCBCSRCC=%s' % ASPSESSIONID
form_data = {
'txtUserID' : '1142',
'imgOK.x' : '31',
'imgOK.y' : '12'
}
    r = http.Request(DO, URL, headers, form_data)
site.request(r)
#---------------------------------------------------------------------
DO = 'POST'
URL = '/Login.asp'
headers = http.POST_Headers(SITE)
headers['Referer'] = 'http://%s/Login.asp' % SITE
headers['Cookie'] = 'ASPSESSIONIDSCBCSRCC=%s' % ASPSESSIONID
form_data = {
'cmbDC' : '311001%2CMETRO MAILS BUS UNIT CAP INV ',
'txtUserID' : '1142',
'imgOK2.x' : '20',
'imgOK2.y' : '5'
}
    r = http.Request(DO, URL, headers, form_data)
site.request(r)
#---------------------------------------------------------------------
DO = 'GET'
URL = '/menuindex.asp?UserID=xxx,xxx'
headers = http.GET_Headers(SITE)
headers['Cookie'] = 'ASPSESSIONIDSCBCSRCC=%s' % ASPSESSIONID
r = http.Request(DO, URL, headers)
site.request(r)
#---------------------------------------------------------------------
DO = 'POST'
URL = '/Redirect.asp?UserID=xxx,xxx'
headers = http.POST_Headers(SITE)
headers['Referer'] = 'http://%s/Login.asp' % SITE
headers['Cookie'] = 'ASPSESSIONIDSCBCSRCC=%s' % ASPSESSIONID
r = http.Request(DO, URL, headers, post_data)
site.request(r)
#------------------------------------------------------------------------
def scrape(postcode):
DO = 'POST'
URL = '/Redirect.asp'
print 'Scraping %d' % postcode
headers = http.POST_Headers(SITE)
headers['Referer'] = 'http://%s/Login.asp' % SITE
headers['Cookie'] = 'ASPSESSIONIDSCBCSRCC=%s' % ASPSESSIONID
post_data = {
'txtCurrentUserID' : 'xxx',
'txtDC' : 'xxx',
'txtCRN' : '',
'txtLastName' : '',
'cmbSearchTypeLastName' : '2',
'txtAddress1' : '',
'cmbSearchTypeAddress1' : '2',
'txtAddress2' : '',
'cmbSearchTypeAddress2' : '3',
'txtSuburb' : '',
'cmbSearchTypeSuburb' : '1',
'txtPostcode' : '%4d' % postcode,
'cmbAction' : '1',
'imgSearch.x' : '24',
'imgSearch.y' : '6'
}
r = http.Request(DO, URL, headers, post_data)
site.request(r)
return r.idx
#------------------------------------------------------------------------
def process():
global site
dir = 'log'
site = http.scraper.Scraper(SITE, log_dir=dir)
login()
data = ''
ofh = open('redirections.dat', 'a+')
for postcode in postcodes:
idx = scrape(postcode)
rows = http.parse('%s/%04d.html' % (dir, idx))
for row in rows:
ofh.write("%s|%s\n" % (row, postcode))
ofh.close()
#------------------------------------------------------------------------
def main(args):
process()
#-------------------------------------------------------------------------------
if __name__ == "__main__":
main(sys.argv[1:])
"""
Date Who Description
---------- --- -----------------------------------------------------
2008-02-20 plh Initial implementation
"""
http Module
__init__.py
from request import Request
import logger
import scraper
from parser import parse
#-------------------------------------------------------------------------------
# 'Accept' : 'text/plain, text/html',
def GET_Headers(site):
return {
'Accept-Encoding' : 'gzip, deflate',
'Accept' : '*/*',
'Accept-Language' : 'en-au',
'Host' : '%s' % site,
'Connection' : 'Keep-Alive',
        'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)'
}
#-------------------------------------------------------------------------------
def POST_Headers(site):
return {
        'Accept' : 'image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-shockwave-flash, */*',
        'Accept-Language' : 'en-au',
        'Content-Type' : 'application/x-www-form-urlencoded',
        'Accept-Encoding' : 'gzip, deflate',
        'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Host' : '%s' % site,
        'Connection' : 'Keep-Alive',
        'Cache-Control' : 'no-cache',
}
#-------------------------------------------------------------------------------
logger.py
#!/usr/bin/env python
#
#
#-------------------------------------------------------------------------------
import pprint
#-------------------------------------------------------------------------------
class Logger():
directory = None
#----------------------------------------------------------------------------
def __init__(self, dir=None):
        print "[http::logger] dir %s" % dir
if dir:
self.directory = dir
else:
self.directory = '.'
#----------------------------------------------------------------------------
def log_request_params(self, idx, params):
of = open('%s/%04d.req_params' % (self.directory, idx), 'w')
of.write("%s\n" % pprint.pformat(params))
of.close()
#----------------------------------------------------------------------------
def log_request_header(self, idx, hdr):
of = open('%s/%04d.req_header' % (self.directory, idx), 'w')
of.write("%s\n" % pprint.pformat(hdr))
of.close()
#----------------------------------------------------------------------------
def log_request_data(self, idx, data):
of = open('%s/%04d.req_data' % (self.directory, idx), 'w')
of.write("%s\n" % pprint.pformat(data))
of.close()
#----------------------------------------------------------------------------
def log_response_header(self, idx, resp):
of = open('%s/%04d.resp_header' % (self.directory, idx), 'w')
of.write("resp.__dict__ ->\n%s\n\n" % pprint.pformat(resp.__dict__))
of.write("Status %s Reason [%s]\n" % (resp.status, resp.reason))
of.write("Msg ->\n%s\n\n" % resp.msg)
of.write("Msg.__dict__ ->\n%s\n\n" % pprint.pformat(resp.msg.__dict__))
of.close()
#----------------------------------------------------------------------------
def log_response_body(self, idx, resp_body):
of = open('%s/%04d.html' % (self.directory, idx), 'w')
of.write(resp_body);
of.close()
#-------------------------------------------------------------------------------
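Logger on its own just writes numbered artefacts into a directory, so it can be exercised without the Scraper (a small sketch; the directory is a placeholder and must already exist):

from logger import Logger

log = Logger('log')
log.log_request_header(1, {'Accept' : '*/*'})      # -> log/0001.req_header
log.log_response_body(1, '<html>...</html>')       # -> log/0001.html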
parser.py
The parser uses the BeautifulSoup module, which can be found here.
#!/usr/bin/env python
import re
import sys
import pprint
#-------------------------------------------------------------------------------
from BeautifulSoup import BeautifulSoup
#-------------------------------------------------------------------------------
tr = 'tr>'
patterns = [
[ ' height="20"', ''],
[ ' class="TableValueBottomLeftBorder"', ''],
[r'<img src="images\\spacer.gif" width="1" \/>', ''],
[ '"frmRedirectionDetails"', ''],
[ ' class="TableValueBottomLeftRightBorder"', ''],
[r'<a href=\'javascript:OpenRedirectionDetails\("RedirectionDetails.asp\?RedirectionID=', ''],
[r' <\/a>', ''],
[r'", \)\'>', '|'],
[r' *', ''],
[r'<\/td><td>', '|'],
[r'<\/td><\/tr>', ''],
[r'<tr><td>', ''],
[r',1,', '|'],
]
#-------------------------------------------------------------------------------
def parse(fname):
data = []
comma = re.compile(',')
    for p in patterns:
        if len(p) == 2:                      # compile each pattern only once
            p.append(re.compile(p[0]))
doc = open(fname, 'r')
soup = BeautifulSoup(doc)
# print len(soup('table', { "class" : "table_style"}))
tables = soup.findAll('table')
for i in range(len(tables)):
if i == 3:
trs = tables[i].findAll('tr')
for j in range(len(trs)):
if j > 3:
s = str(trs[j])
for p in patterns:
s = p[2].sub(p[1], s)
if re.search(tr, s):
continue
s = comma.sub('|', s, 1)
data.append(s)
return data
#-------------------------------------------------------------------------------
def test():
results = parse('0010.html')
if results != None:
print results
#-------------------------------------------------------------------------------
def main(args):
test()
#-------------------------------------------------------------------------------
if __name__ == "__main__":
main(sys.argv[1:])
"""
Date Who Description
---------- --- -----------------------------------------------------
2008-02-20 plh Initial implementation
"""
request.py
#!/usr/bin/env python
#-------------------------------------------------------------------------------
class Request:
Method = 'GET'
URL = None
Request_headers = None
Post_data = None
Request_params = None
Response_headers = None
Response_body = None
#----------------------------------------------------------------------------
def __init__(self, method, url, headers, post_data=None):
self.Method = method
self.URL = url
self.Request_headers = headers
self.Post_data = post_data
#-------------------------------------------------------------------------------
scraper.py
#!/usr/bin/env python
#
#
#-------------------------------------------------------------------------------
import urllib
import httplib
#-------------------------------------------------------------------------------
from logger import Logger
#-------------------------------------------------------------------------------
class Scraper():
idx = None
connection = None
logger = None
def __init__(self, site, log_dir=None, protocol='http'):
print "[http::scraper] log_dir %s" % log_dir
if (protocol == 'https'):
self.connection = httplib.HTTPSConnection(site)
else:
self.connection = httplib.HTTPConnection(site)
self.idx = 0
self.logger = Logger(log_dir)
#----------------------------------------------------------------------------
def get_idx(self):
return self.idx
#----------------------------------------------------------------------------
def request(self, r, debug=None):
if debug: print '>>>> %s %s <<<<' % (r.Method, r.URL)
if r.Post_data:
r.Request_params = urllib.urlencode(r.Post_data)
if (debug and (debug > 2)): print r.Request_params
r.Request_headers['Content-Length'] = len(r.Request_params)
self.connection.request(r.Method, r.URL, r.Request_params, r.Request_headers)
resp = self.connection.getresponse()
self.logger.log_request_header(self.idx, r.Request_headers)
self.logger.log_response_header(self.idx, resp)
r.Response_headers = resp
r.Response_body = resp.read()
self.logger.log_response_body(self.idx, r.Response_body)
if (debug and (debug > 2)): print r.Response_body
r.idx = self.idx
self.idx += 1
return r
#-------------------------------------------------------------------------------
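The protocol argument switches the underlying connection to httplib.HTTPSConnection, so the same Scraper drives an HTTPS site unchanged (host is a placeholder):

from scraper import Scraper

# log directory must already exist before any request is logged
secure = Scraper('secure.example.com', log_dir='log', protocol='https')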