Http get.py

#!/usr/bin/env python

import re
import zlib
import gzip
import socket
import StringIO

#-----------------------------------------------------------------------

HOST     = "host"
PORT     = 80

p_Encoding = re.compile('Transfer-Encoding')

#-----------------------------------------------------------------------

#   'Accept'             : 'text/plain, text/html',
#   'Accept-Encoding'    : 'gzip, deflate',

"""
get_headers = {
   'Accept-Encoding'    : 'gzip, deflate',
   'Accept'             : '*/*',
   'Accept-Language'    : 'en-au',
   'Host'               : SITE,
   'Connection'         : 'Keep-Alive',
   'User-Agent'         :  'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727'
}


post_headers = {
   'Accept:'            : 'image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-shockwave-flash, */*',
   'Accept-Language:'   : 'en-au',
   'Content-Type:'      : 'application/x-www-form-urlencoded',
   'Accept-Encoding:'   : 'gzip, deflate',
   'User-Agent:'        : 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
   'Host'               : SITE,
   'Connection:'        : 'Keep-Alive',
   'Cache-Control:'     : 'no-cache',
}
"""

#-----------------------------------------------------------------------

def dump(x):
   while i < len(x):
      print "%s %04x" % (repr(x[i]), ord(x[i]))
      i += 1

#-----------------------------------------------------------------------

headers = {}

HEADER  = 0
BODY    = 1
context = HEADER

data_encoding    = None
tranfer_encoding = None
chunk_length     = None
body             = ''

def parse(s):
   global body
   global context
   global transfer_encoding
   global chunk_length

   cnt = 0

   finished = False

   while True:
      if chunk_length:
         if len(s) < chunk_length:
             break
         else:
             data = s[:chunk_length]
             body += data
             s = s[chunk_length:]
             print "Chopped out %d bytes" % chunk_length
             chunk_length = None
             idx = s.find('\r\n')
             s = s[2:]
             # print "[%s] -> %d - %d" % (s, len(s), idx)
             continue
      else:  # <CR><LF> delimited text
         idx = s.find('\r\n')

         if idx >= 0:
            l = s[:idx]

            s = s[idx+2:]

            if context == HEADER:
               if len(l) > 0:
                  idx = l.find(': ')
                  if idx > 0:
                     tag = l[:idx]
                     value = l[idx+2:]
                     print "Tag [%s]  Value [%s]" % (tag, value)
                     headers[tag] = value
               else:
                  context = BODY
                  if headers.has_key('Transfer-Encoding'):
                     if headers['Transfer-Encoding'] == 'chunked':
                        transfer_encoding = 'chunked'
            else:
               if transfer_encoding == 'chunked':
                  if not chunk_length:
                     print "chunk -> [%s]" % l
                     chunk_length = int(l, 16)
                     print "chunk_length -> %d [%d]" % (chunk_length, len(s))
                     # print s
                     if chunk_length == 0:  # Should be done...
                        print  "# Should be done..."
                        idx = s.find('\r\n')
                        print "idx -> %d" % idx
                        if idx == 0:
                           finished = True
                           s = body
                           break
                     if len(s) < chunk_length:
                        break
                     else:
                        continue
                  else:
                     print "Should not get here!"
                     break
         else:
            break

   return (finished, s)

#-----------------------------------------------------------------------

def setup():
   # Create an INET, STREAMing socket

   s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

   s.connect((HOST, PORT))

   return s

#-----------------------------------------------------------------------

# Accept-Encoding: gzip, deflate

request = """\
GET /test HTTP/1.1
Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-shockwave-flash, */*
Accept-Encoding: gzip, deflate
Accept-Language: en-au
User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)
Host: host
Connection: Keep-Alive
"""

def get():
   s = setup()

   print "len(request) = %d" % len(request)

   n = s.send(request)

   print"send() -> %d" % n

   cnt = 0

   residue = ""

   while True:
      recv = s.recv(1024)

      buf = residue + recv

      i = 0

      (done, residue) = parse(buf)

      print ">**> %s" % done

      cnt += 1

      if done: break

   print len(residue)

   # print zlib.decompress(residue)
   # print residue
   data = gzip.GzipFile('', 'rb', 9, StringIO.StringIO(residue))
   page = data.read()
   print page


#-----------------------------------------------------------------------

def main():
   get()

#-----------------------------------------------------------------------

main()

#-----------------------------------------------------------------------

"""
# zlib.decompressobj().decompress('x\x9c' + binary_str)


So you can do the same from you Python code. Just add ('Accept-Encoding', 'gzip,deflate') in the request header. Check the following code chunk:

opener = urllib2.build_opener()
opener.addheaders = [('Referer', referer),
('User-Agent', uagent),
('Accept-Encoding', 'gzip,deflate')]
usock = opener.open(url)
url = usock.geturl()
data = decode(usock)
usock.close()
return data

Note the decode() function used in the code. Yes, you have to decode the content (if it's compressed).

def decode (page):
    encoding = page.info().get("Content-Encoding")    
    if encoding in ('gzip', 'x-gzip', 'deflate'):
        content = page.read()
        if encoding == 'deflate':
            data = StringIO.StringIO(zlib.decompress(content))
        else:
            data = gzip.GzipFile('', 'rb', 9, StringIO.StringIO(content))
        page = data.read()

    return page

"""
Http get.py

Navigation menu

Search