Http get.py
Revision as of 11:14, 4 December 2008 by PeterHarding (talk | contribs)
#!/usr/bin/env python import re import zlib import gzip import socket import StringIO #----------------------------------------------------------------------- SVTAPPS = "svtapps" PORT = 80 p_Encoding = re.compile('Transfer-Encoding') #----------------------------------------------------------------------- # 'Accept' : 'text/plain, text/html', # 'Accept-Encoding' : 'gzip, deflate', """ get_headers = { 'Accept-Encoding' : 'gzip, deflate', 'Accept' : '*/*', 'Accept-Language' : 'en-au', 'Host' : SITE, 'Connection' : 'Keep-Alive', 'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727' } post_headers = { 'Accept:' : 'image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-shockwave-flash, */*', 'Accept-Language:' : 'en-au', 'Content-Type:' : 'application/x-www-form-urlencoded', 'Accept-Encoding:' : 'gzip, deflate', 'User-Agent:' : 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)', 'Host' : SITE, 'Connection:' : 'Keep-Alive', 'Cache-Control:' : 'no-cache', } """ #----------------------------------------------------------------------- def dump(x): while i < len(x): print "%s %04x" % (repr(x[i]), ord(x[i])) i += 1 #----------------------------------------------------------------------- headers = {} HEADER = 0 BODY = 1 context = HEADER data_encoding = None tranfer_encoding = None chunk_length = None body = '' def parse(s): global body global context global transfer_encoding global chunk_length cnt = 0 finished = False while True: if chunk_length: if len(s) < chunk_length: break else: data = s[:chunk_length] body += data s = s[chunk_length:] print "Chopped out %d bytes" % chunk_length chunk_length = None idx = s.find('\r\n') s = s[2:] # print "[%s] -> %d - %d" % (s, len(s), idx) continue else: # <CR><LF> delimited text idx = s.find('\r\n') if idx >= 0: l = s[:idx] s = s[idx+2:] if context == HEADER: if len(l) > 0: idx = l.find(': ') if idx > 0: tag = l[:idx] value = l[idx+2:] print "Tag [%s] Value [%s]" % (tag, value) headers[tag] = value else: context = BODY if headers.has_key('Transfer-Encoding'): if headers['Transfer-Encoding'] == 'chunked': transfer_encoding = 'chunked' else: if transfer_encoding == 'chunked': if not chunk_length: print "chunk -> [%s]" % l chunk_length = int(l, 16) print "chunk_length -> %d [%d]" % (chunk_length, len(s)) # print s if chunk_length == 0: # Should be done... print "# Should be done..." idx = s.find('\r\n') print "idx -> %d" % idx if idx == 0: finished = True s = body break if len(s) < chunk_length: break else: continue else: print "Should not get here!" break else: break return (finished, s) #----------------------------------------------------------------------- def setup(): # Create an INET, STREAMing socket s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.connect((SVTAPPS, PORT)) return s #----------------------------------------------------------------------- # Accept-Encoding: gzip, deflate request = """\ GET /mdcs HTTP/1.1 Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-shockwave-flash, */* Accept-Encoding: gzip, deflate Accept-Language: en-au User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727) Host: svtapps Connection: Keep-Alive """ def get(): s = setup() print "len(request) = %d" % len(request) n = s.send(request) print"send() -> %d" % n cnt = 0 residue = "" while True: recv = s.recv(1024) buf = residue + recv i = 0 (done, residue) = parse(buf) print ">**> %s" % done cnt += 1 if done: break print len(residue) # print zlib.decompress(residue) # print residue data = gzip.GzipFile('', 'rb', 9, StringIO.StringIO(residue)) page = data.read() print page #----------------------------------------------------------------------- def main(): get() #----------------------------------------------------------------------- main() #----------------------------------------------------------------------- """ # zlib.decompressobj().decompress('x\x9c' + binary_str) So you can do the same from you Python code. Just add ('Accept-Encoding', 'gzip,deflate') in the request header. Check the following code chunk: opener = urllib2.build_opener() opener.addheaders = [('Referer', referer), ('User-Agent', uagent), ('Accept-Encoding', 'gzip,deflate')] usock = opener.open(url) url = usock.geturl() data = decode(usock) usock.close() return data Note the decode() function used in the code. Yes, you have to decode the content (if it's compressed). def decode (page): encoding = page.info().get("Content-Encoding") if encoding in ('gzip', 'x-gzip', 'deflate'): content = page.read() if encoding == 'deflate': data = StringIO.StringIO(zlib.decompress(content)) else: data = gzip.GzipFile('', 'rb', 9, StringIO.StringIO(content)) page = data.read() return page """