Python - httplib
Examples
Whitepages
See the script that page-scrapes search results off the Whitepages site:
Parsing WhitePages Search Results HTML
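All of the examples on this page follow the same basic pattern: open an httplib.HTTPConnection, send a request with hand-built headers, then run regexes (or BeautifulSoup) over the response body. A minimal sketch of that pattern, assuming a hypothetical host, path and regex (not the actual Whitepages script):

#!/usr/bin/env python
# Minimal httplib GET + regex scrape.
# The host, path and pattern below are placeholders, not the real Whitepages ones.
import re
import httplib

conn = httplib.HTTPConnection('www.example.com')            # hypothetical host
conn.request('GET', '/search?name=smith', None,             # hypothetical path
             {'User-Agent' : 'Mozilla/4.0', 'Accept' : 'text/html'})
resp = conn.getresponse()
print resp.status, resp.reason

body = resp.read()
for m in re.finditer(r'<td class="name">(.*?)</td>', body): # placeholder pattern
    print m.group(1)

conn.close()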
File Download Example
#!/usr/bin/env python
#
#
#-------------------------------------------------------------------------------
import re
import sys
import urllib
import httplib
import binascii
#-------------------------------------------------------------------------------
SITE = 'host'
URL  = '/url/login'

params = urllib.urlencode({'aaa' : 1})

get_headers = {
    'Accept-Language' : 'en-au',
    'Accept'          : 'text/plain',
    'Content-Type'    : 'text/html; charset=utf-8',
    'Connection'      : 'Keep-Alive',
    'Host'            : SITE,
    'User-Agent'      : 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)'
}

post_headers = {
    'Accept-Language' : 'en-au',
    'Accept-Encoding' : 'gzip, deflate',
    'Content-Type'    : 'application/x-www-form-urlencoded',
    'Host'            : SITE,
    'Connection'      : 'Keep-Alive',
    'Cache-Control'   : 'no-cache',
    'User-Agent'      : 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)'
}
# 'Content-Length' : len(request),
# 'Cookie'         : 'JSESSIONID=%s' % JSESSIONID
#-------------------------------------------------------------------------------
def log_header(idx, resp):
    of = open('%04d.hdr' % idx, 'w')
    of.write("resp.__dict__ -> '%s'\n" % resp.__dict__)
    of.write("Status %s Reason [%s]\n" % (resp.status, resp.reason))
    of.write("Msg -> '%s'\n" % resp.msg)
    of.write("Msg.__dict__ -> '%s'\n" % resp.msg.__dict__)
    #xxx = "Msg.__dict__ -> '%s'" % resp.msg.__dict__['dict']['set-cookie']
    #print xxx
    of.close()
#-------------------------------------------------------------------------------
def log_body(idx, resp_body):
    of = open('%04d.bdy' % idx, 'w')
    of.write(resp_body)
    of.close()
#-------------------------------------------------------------------------------
def do():
    global URL    # URL is reassigned below once the session id is known

    conn = httplib.HTTPConnection(SITE)
    #---------------------------------------------------------------------
    idx = 1
    print ">>>>> GET %s <<<<<" % URL

    conn.request("GET", '/%s/login.do' % URL, None, get_headers)
    resp = conn.getresponse()
    log_header(idx, resp)

    # Pull the session id out of the Set-Cookie header.
    m = re.search('JSESSIONID=(.*);', resp.msg.__dict__['dict']['set-cookie'])
    if m:
        print m.group(1)
        JSESSIONID = m.group(1)

    resp_body = resp.read()
    log_body(idx, resp_body)
    print resp_body
    # <form action="http://xxxx:80/CustomerPortalWeb/login/login.do;jsessionid=vgp9GDVS6JyTly0v6NfsHG0rt1pLyvpMLxYnJf9MXsk3Yn0T2SZ3!1111094026" method="post">
    #---------------------------------------------------------------------
    idx = 2
    print ">>>>> POST /%s/login.do <<<<<" % URL
    print ">>>>> JSESSIONID = %s " % JSESSIONID

    URL = "/CustomerPortalWeb/login/login.do;jsessionid=%s" % JSESSIONID

    # form_data = {
    #     '{actionForm.username}' : 'svtest035@svt',
    #     '{actionForm.password}' : 'xxxx'
    # }
    form_data = {
        '{actionForm.username}' : 'admin',
        '{actionForm.password}' : 'xxxx'
    }
    params = urllib.urlencode(form_data)

    post_headers['Content-Length'] = len(params)
    post_headers['Cookie']         = 'JSESSIONID=%s' % JSESSIONID

    conn.request("POST", URL, params, post_headers)
    resp = conn.getresponse()
    log_header(idx, resp)
    resp_body = resp.read()
    log_body(idx, resp_body)
    #---------------------------------------------------------------------
    idx = 3
    print ">>>>> GET /%s/test.do <<<<<" % URL

    get_headers['Cookie'] = 'JSESSIONID=%s' % JSESSIONID
    conn.request("GET", '/%s/test.do' % URL, None, get_headers)
    resp = conn.getresponse()
    log_header(idx, resp)
    resp_body = resp.read()
    log_body(idx, resp_body)
    #---------------------------------------------------------------------
    idx = 4
    args = {
        '_nfpb'      : 'true',
        '_pageLabel' : 'ImportUserPage'
    }
    ue_args = urllib.urlencode(args)

    print ">>>>> GET /%s/test.do <<<<<" % URL

    get_headers['Cookie'] = 'JSESSIONID=%s' % JSESSIONID
    conn.request("GET", '/%s/test.do?%s' % (URL, ue_args), None, get_headers)
    resp = conn.getresponse()
    log_header(idx, resp)
    resp_body = resp.read()
    log_body(idx, resp_body)
    #---------------------------------------------------------------------
    conn.close()
#-------------------------------------------------------------------------------
def main(args):
    do()
#-------------------------------------------------------------------------------
if __name__ == "__main__":
    main(sys.argv[1:])
#-------------------------------------------------------------------------------
"""
Regex Stuff:

    regex = re.compile("\\n *")
    (name, cnt) = re.subn('esb:', '', node_name)
    value = re.sub(r'\n *', 'N/A', value)
"""

"""
FILE DOWNLOAD:

    h.putrequest('POST', '/scripts/cgi.exe?')
    h.putheader('Content-length', '%d' % len(params))
    h.putheader('Accept', 'text/plain')
    h.putheader('Host', 'test.site.com')
    h.endheaders()
    h.send(params)
    reply, msg, hdrs = h.getreply()
    data = h.getfile().read()
    file('test.file', 'w').write(data)
    h.close()
"""

"""
Accept-Language: en-au
Content-Type: application/x-www-form-urlencoded
Accept-Encoding: gzip, deflate
User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)
Host: xxx:8080
Content-Length: 54
Connection: Keep-Alive
Cache-Control: no-cache
Cookie: JSESSIONID=xxx
"""
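The FILE DOWNLOAD fragment in the trailing notes above uses the old httplib.HTTP interface (putrequest/getreply/getfile). A rough equivalent using the same HTTPConnection API as the rest of the script might look like the sketch below; the host, path and form fields are placeholders:

# Rough HTTPConnection equivalent of the legacy putrequest()-style download
# shown in the notes above; host, path and form fields are placeholders.
import urllib
import httplib

params  = urllib.urlencode({'report' : 'monthly'})   # hypothetical form data
headers = {
    'Content-Type'   : 'application/x-www-form-urlencoded',
    'Content-Length' : len(params),
    'Accept'         : 'text/plain',
}

conn = httplib.HTTPConnection('test.site.com')
conn.request('POST', '/scripts/cgi.exe?', params, headers)
resp = conn.getresponse()

data = resp.read()
open('test.file', 'wb').write(data)                  # save the response body to disk
conn.close()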
Modular
See here for a zipped-up copy of the files.
action
#!/usr/bin/env python
#
#
#-------------------------------------------------------------------------------
import re
import sys
#-------------------------------------------------------------------------------
import http
#-------------------------------------------------------------------------------
SITE = 'host'

site         = None
ASPSESSIONID = None

postcodes = [2000, 2001, 2007, 2010, 3000, 3001, 3162, 3124, 3005, 6009, 7001]
#-------------------------------------------------------------------------------
def login():
    global ASPSESSIONID
    #------------------------------------------------------------------------
    DO  = 'GET'
    URL = '/'

    headers = http.GET_Headers(SITE)

    r = http.Request(DO, URL, headers)
    site.request(r)
    #------------------------------------------------------------------------
    DO  = 'GET'
    URL = '/login.asp'

    headers = http.GET_Headers(SITE)
    headers['Referer'] = 'http://%s/' % SITE

    r = http.Request(DO, URL, headers)
    site.request(r)

    # Pull the session id out of the Set-Cookie header.
    m = re.search('ASPSESSIONIDSCBCSRCC=(.*);',
                  r.Response_headers.msg.__dict__['dict']['set-cookie'])
    if m:
        print "ASPSESSIONID --> %s" % m.group(1)
        ASPSESSIONID = m.group(1)
    #---------------------------------------------------------------------
    DO  = 'POST'
    URL = '/Login.asp'

    headers = http.POST_Headers(SITE)
    headers['Referer'] = 'http://%s/login.asp' % SITE
    headers['Cookie']  = 'ASPSESSIONIDSCBCSRCC=%s' % ASPSESSIONID

    post_data = {
        'txtUserName' : 'xxx',
        'txtPassword' : 'xxx',
        'txtUserID'   : '0',
        'imgOK.x'     : '32',
        'imgOK.y'     : '11'
    }

    r = http.Request(DO, URL, headers, post_data)
    site.request(r)
    #---------------------------------------------------------------------
    DO  = 'POST'
    URL = '/Login.asp'

    headers = http.POST_Headers(SITE)
    headers['Referer'] = 'http://%s/login.asp' % SITE
    headers['Cookie']  = 'ASPSESSIONIDSCBCSRCC=%s' % ASPSESSIONID

    post_data = {
        'txtUserID' : '1142',
        'imgOK.x'   : '31',
        'imgOK.y'   : '12'
    }

    r = http.Request(DO, URL, headers, post_data)
    site.request(r)
    #---------------------------------------------------------------------
    DO  = 'POST'
    URL = '/Login.asp'

    headers = http.POST_Headers(SITE)
    headers['Referer'] = 'http://%s/Login.asp' % SITE
    headers['Cookie']  = 'ASPSESSIONIDSCBCSRCC=%s' % ASPSESSIONID

    post_data = {
        'cmbDC'     : '311001%2CMETRO MAILS BUS UNIT CAP INV ',
        'txtUserID' : '1142',
        'imgOK2.x'  : '20',
        'imgOK2.y'  : '5'
    }

    r = http.Request(DO, URL, headers, post_data)
    site.request(r)
    #---------------------------------------------------------------------
    DO  = 'GET'
    URL = '/menuindex.asp?UserID=xxx,xxx'

    headers = http.GET_Headers(SITE)
    headers['Cookie'] = 'ASPSESSIONIDSCBCSRCC=%s' % ASPSESSIONID

    r = http.Request(DO, URL, headers)
    site.request(r)
    #---------------------------------------------------------------------
    DO  = 'POST'
    URL = '/Redirect.asp?UserID=xxx,xxx'

    headers = http.POST_Headers(SITE)
    headers['Referer'] = 'http://%s/Login.asp' % SITE
    headers['Cookie']  = 'ASPSESSIONIDSCBCSRCC=%s' % ASPSESSIONID

    r = http.Request(DO, URL, headers, post_data)
    site.request(r)
#------------------------------------------------------------------------
def scrape(postcode):
    DO  = 'POST'
    URL = '/Redirect.asp'

    print 'Scraping %d' % postcode

    headers = http.POST_Headers(SITE)
    headers['Referer'] = 'http://%s/Login.asp' % SITE
    headers['Cookie']  = 'ASPSESSIONIDSCBCSRCC=%s' % ASPSESSIONID

    post_data = {
        'txtCurrentUserID'      : 'xxx',
        'txtDC'                 : 'xxx',
        'txtCRN'                : '',
        'txtLastName'           : '',
        'cmbSearchTypeLastName' : '2',
        'txtAddress1'           : '',
        'cmbSearchTypeAddress1' : '2',
        'txtAddress2'           : '',
        'cmbSearchTypeAddress2' : '3',
        'txtSuburb'             : '',
        'cmbSearchTypeSuburb'   : '1',
        'txtPostcode'           : '%4d' % postcode,
        'cmbAction'             : '1',
        'imgSearch.x'           : '24',
        'imgSearch.y'           : '6'
    }

    r = http.Request(DO, URL, headers, post_data)
    site.request(r)

    return r.idx
#------------------------------------------------------------------------
def process():
    global site

    dir  = 'log'
    site = http.scraper.Scraper(SITE, log_dir=dir)

    login()

    data = ''
    ofh  = open('redirections.dat', 'a+')
    for postcode in postcodes:
        idx  = scrape(postcode)
        rows = http.parse('%s/%04d.html' % (dir, idx))
        for row in rows:
            ofh.write("%s|%s\n" % (row, postcode))
    ofh.close()
#------------------------------------------------------------------------
def main(args):
    process()
#-------------------------------------------------------------------------------
if __name__ == "__main__":
    main(sys.argv[1:])

"""
Date       Who Description
---------- --- -----------------------------------------------------
2008-02-20 plh Initial implementation
"""
http Module
__init__.py
from request import Request
import logger
import scraper
from parser import parse
#-------------------------------------------------------------------------------
# 'Accept' : 'text/plain, text/html',
def GET_Headers(site):
    return {
        'Accept-Encoding' : 'gzip, deflate',
        'Accept'          : '*/*',
        'Accept-Language' : 'en-au',
        'Host'            : '%s' % site,
        'Connection'      : 'Keep-Alive',
        'User-Agent'      : 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)'
    }
#-------------------------------------------------------------------------------
def POST_Headers(site):
    # Note: header names must not include a trailing ':' -- httplib adds it.
    return {
        'Accept'          : 'image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-shockwave-flash, */*',
        'Accept-Language' : 'en-au',
        'Content-Type'    : 'application/x-www-form-urlencoded',
        'Accept-Encoding' : 'gzip, deflate',
        'User-Agent'      : 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Host'            : '%s' % site,
        'Connection'      : 'Keep-Alive',
        'Cache-Control'   : 'no-cache',
    }
#-------------------------------------------------------------------------------
logger.py
#!/usr/bin/env python
#
#
#-------------------------------------------------------------------------------
import pprint
#-------------------------------------------------------------------------------
class Logger():

    directory = None
    #----------------------------------------------------------------------------
    def __init__(self, dir=None):
        print "[http::logger] dir %s" % dir
        if dir:
            self.directory = dir
        else:
            self.directory = '.'
    #----------------------------------------------------------------------------
    def log_request_params(self, idx, params):
        of = open('%s/%04d.req_params' % (self.directory, idx), 'w')
        of.write("%s\n" % pprint.pformat(params))
        of.close()
    #----------------------------------------------------------------------------
    def log_request_header(self, idx, hdr):
        of = open('%s/%04d.req_header' % (self.directory, idx), 'w')
        of.write("%s\n" % pprint.pformat(hdr))
        of.close()
    #----------------------------------------------------------------------------
    def log_request_data(self, idx, data):
        of = open('%s/%04d.req_data' % (self.directory, idx), 'w')
        of.write("%s\n" % pprint.pformat(data))
        of.close()
    #----------------------------------------------------------------------------
    def log_response_header(self, idx, resp):
        of = open('%s/%04d.resp_header' % (self.directory, idx), 'w')
        of.write("resp.__dict__ ->\n%s\n\n" % pprint.pformat(resp.__dict__))
        of.write("Status %s Reason [%s]\n" % (resp.status, resp.reason))
        of.write("Msg ->\n%s\n\n" % resp.msg)
        of.write("Msg.__dict__ ->\n%s\n\n" % pprint.pformat(resp.msg.__dict__))
        of.close()
    #----------------------------------------------------------------------------
    def log_response_body(self, idx, resp_body):
        of = open('%s/%04d.html' % (self.directory, idx), 'w')
        of.write(resp_body)
        of.close()
#-------------------------------------------------------------------------------
parser.py
The parser uses the BeautifulSoup module, which can be found here.
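Before the regex clean-up in the full listing below, the core BeautifulSoup 3 calls the parser relies on are just these; a minimal sketch, where '0010.html' is only an example log file name and the "fourth table holds the results" assumption mirrors what parse() does:

# Essential BeautifulSoup 3 calls used by parser.py below.
from BeautifulSoup import BeautifulSoup

soup   = BeautifulSoup(open('0010.html', 'r'))   # example log file written by the scraper
tables = soup.findAll('table')                   # all <table> elements in the page
rows   = tables[3].findAll('tr')                 # fourth table: the results grid (as parse() assumes)
for row in rows[4:]:                             # skip the header rows, as parse() does
    print str(row)                               # raw HTML for each result row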
#!/usr/bin/env python
import re
import sys
import pprint
#-------------------------------------------------------------------------------
from BeautifulSoup import BeautifulSoup
#-------------------------------------------------------------------------------
tr = 'tr>'

patterns = [
    [ ' height="20"', ''],
    [ ' class="TableValueBottomLeftBorder"', ''],
    [r'<img src="images\\spacer.gif" width="1" \/>', ''],
    [ '"frmRedirectionDetails"', ''],
    [ ' class="TableValueBottomLeftRightBorder"', ''],
    [r'<a href=\'javascript:OpenRedirectionDetails\("RedirectionDetails.asp\?RedirectionID=', ''],
    [r' <\/a>', ''],
    [r'", \)\'>', '|'],
    [r' *', ''],
    [r'<\/td><td>', '|'],
    [r'<\/td><\/tr>', ''],
    [r'<tr><td>', ''],
    [r',1,', '|'],
]
#-------------------------------------------------------------------------------
def parse(fname):
    data  = []
    comma = re.compile(',')

    # Pre-compile each pattern and stash the compiled regex as a third element.
    for p in patterns:
        p.append(re.compile(p[0]))

    doc  = open(fname, 'r')
    soup = BeautifulSoup(doc)
    # print len(soup('table', { "class" : "table_style"}))

    tables = soup.findAll('table')
    for i in range(len(tables)):
        if i == 3:
            trs = tables[i].findAll('tr')
            for j in range(len(trs)):
                if j > 3:
                    s = str(trs[j])
                    for p in patterns:
                        s = p[2].sub(p[1], s)
                    if re.search(tr, s):
                        continue
                    s = comma.sub('|', s, 1)
                    data.append(s)
    return data
#-------------------------------------------------------------------------------
def test():
    results = parse('0010.html')
    if results != None:
        print results
#-------------------------------------------------------------------------------
def main(args):
    test()
#-------------------------------------------------------------------------------
if __name__ == "__main__":
    main(sys.argv[1:])

"""
Date       Who Description
---------- --- -----------------------------------------------------
2008-02-20 plh Initial implementation
"""
request.py
#!/usr/bin/env python
#-------------------------------------------------------------------------------
class Request:

    Method           = 'GET'
    URL              = None
    Request_headers  = None
    Post_data        = None
    Request_params   = None
    Response_headers = None
    Response_body    = None
    #----------------------------------------------------------------------------
    def __init__(self, method, url, headers, post_data=None):
        self.Method          = method
        self.URL             = url
        self.Request_headers = headers
        self.Post_data       = post_data
#-------------------------------------------------------------------------------
scraper.py
#!/usr/bin/env python
#
#
#-------------------------------------------------------------------------------
import urllib
import httplib
#-------------------------------------------------------------------------------
from logger import Logger
#-------------------------------------------------------------------------------
class Scraper():

    idx        = None
    connection = None
    logger     = None

    def __init__(self, site, log_dir=None, protocol='http'):
        print "[http::scraper] log_dir %s" % log_dir
        if (protocol == 'https'):
            self.connection = httplib.HTTPSConnection(site)
        else:
            self.connection = httplib.HTTPConnection(site)
        self.idx    = 0
        self.logger = Logger(log_dir)
    #----------------------------------------------------------------------------
    def get_idx(self):
        return self.idx
    #----------------------------------------------------------------------------
    def request(self, r, debug=None):
        if debug:
            print '>>>> %s %s <<<<' % (r.Method, r.URL)

        if r.Post_data:
            r.Request_params = urllib.urlencode(r.Post_data)
            if (debug and (debug > 2)):
                print r.Request_params
            r.Request_headers['Content-Length'] = len(r.Request_params)

        self.connection.request(r.Method, r.URL, r.Request_params, r.Request_headers)
        resp = self.connection.getresponse()

        self.logger.log_request_header(self.idx, r.Request_headers)
        self.logger.log_response_header(self.idx, resp)

        r.Response_headers = resp
        r.Response_body    = resp.read()
        self.logger.log_response_body(self.idx, r.Response_body)
        if (debug and (debug > 2)):
            print r.Response_body

        r.idx     = self.idx
        self.idx += 1

        return r
#-------------------------------------------------------------------------------
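Putting the pieces together, a minimal driver for the http package might look like the sketch below; the host, path and log directory are placeholders, and the action script above shows the real usage.

#!/usr/bin/env python
# Minimal driver for the http package above.
# SITE, the path and the log directory are placeholders; 'log' must already exist,
# since Logger opens its output files directly under it.
import http

SITE = 'www.example.com'                        # hypothetical host
site = http.scraper.Scraper(SITE, log_dir='log')

headers = http.GET_Headers(SITE)
r = http.Request('GET', '/login.asp', headers)  # hypothetical path

site.request(r, debug=1)                        # logs headers and body under log/
print r.Response_headers.status                 # underlying httplib response object
print 'saved as log/%04d.html' % r.idx          # index assigned by the scraper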