Experimenting with HTTP

From PerformIQ Upgrade
Revision as of 17:09, 12 November 2008 by PeterHarding (talk | contribs) (New page: =An Experimental Reader= <pre> #!/usr/bin/env python import re import zlib import gzip import socket import StringIO #-------------------------------------------------------------------...)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigation Jump to search

An Experimental Reader

#!/usr/bin/env python

import re
import zlib
import gzip
import socket
import StringIO

#-----------------------------------------------------------------------

# Host name used as the connection target (assigned to URL below).
SVTAPPS  = "svtapps"
# Alternative host name; NOTE: replaced by the reassignment two lines down.
URL      = 'www.performiq.com.au'

# Effective target host: this overrides the assignment above.
URL      = SVTAPPS
# TCP port that setup() connects to, together with URL.
PORT     = 80

# Compiled pattern for the literal text 'Transfer-Encoding'; it is not
# referenced anywhere else in this listing.
p_Encoding = re.compile('Transfer-Encoding')

#-----------------------------------------------------------------------

#   'Accept'             : 'text/plain, text/html',
#   'Accept-Encoding'    : 'gzip, deflate',

"""
get_headers = {
   'Accept-Encoding'    : 'gzip, deflate',
   'Accept'             : '*/*',
   'Accept-Language'    : 'en-au',
   'Host'               : SITE,
   'Connection'         : 'Keep-Alive',
   'User-Agent'         :  'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727'
}


post_headers = {
   'Accept:'            : 'image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-shockwave-flash, */*',
   'Accept-Language:'   : 'en-au',
   'Content-Type:'      : 'application/x-www-form-urlencoded',
   'Accept-Encoding:'   : 'gzip, deflate',
   'User-Agent:'        : 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
   'Host'               : SITE,
   'Connection:'        : 'Keep-Alive',
   'Cache-Control:'     : 'no-cache',
}
"""

#-----------------------------------------------------------------------

def dump(x):
   """Print every character of *x* on its own line for debugging.

   Each output line holds the repr() of the character followed by its
   code point as a zero-padded, four-digit hexadecimal number.

   x -- the string to dump; an empty string prints nothing.
   """
   # Iterating directly avoids the index variable that the earlier version
   # used without ever initialising it.
   for ch in x:
      print("%s %04x" % (repr(ch), ord(ch)))

#-----------------------------------------------------------------------

# Parser state.  parse() is incremental, so everything it has learned about
# the response so far lives in these module-level variables.

headers = {}       # header fields seen so far, keyed by name as received
status  = None     # numeric HTTP status, set once the status line is parsed

HEADER  = 0        # context value: still reading status line / header fields
BODY    = 1        # context value: reading the message body
context = HEADER

data_encoding     = None
transfer_encoding = None   # becomes 'chunked' when the server says so
chunk_length      = None   # size of the chunk currently being collected
body              = ''     # body data collected so far


def reset_parser():
   """Put the parser state back to its initial values.

   Call this before feeding parse() the first bytes of a new response.
   """
   global status
   global body
   global context
   global transfer_encoding
   global chunk_length

   headers.clear()
   status            = None
   body              = ''
   context           = HEADER
   transfer_encoding = None
   chunk_length      = None


def _get_header(name):
   # Look a header up without regard to case: field names are
   # case-insensitive on the wire.  Returns None when it is absent.
   wanted = name.lower()
   for tag in headers:
      if tag.lower() == wanted:
         return headers[tag]
   return None


def parse(s):
   """Consume as much of the buffered response text *s* as possible.

   The function is meant to be called repeatedly: the caller appends newly
   received data to the residue returned by the previous call and passes
   the result back in.

   s -- unparsed response text (previous residue plus new data).

   Returns a tuple (finished, data):
      finished is False -- data is the text that could not be parsed yet
                           and must be passed back in on the next call;
      finished is True  -- data is the complete, de-chunked message body.

   Bodies are framed either by 'Transfer-Encoding: chunked' or by a
   Content-Length header.  When neither is present the body runs until the
   connection closes: the data is accumulated in the module-level `body`
   and finished is never reported, so the caller has to detect the close.

   Raises ValueError when a chunk size or Content-Length value is not a
   valid number, or when chunk data is not followed by CRLF.
   """
   global status
   global body
   global context
   global transfer_encoding
   global chunk_length

   finished = False

   while True:
      if context == HEADER:
         idx = s.find('\r\n')

         if idx < 0:
            break   # incomplete line - wait for more data

         l = s[:idx]
         s = s[idx+2:]

         if l:
            if status:
               tag, sep, value = l.partition(':')

               if sep and tag:
                  value = value.strip()
                  print("Tag [%s]  Value [%s]" % (tag, value))
                  headers[tag] = value
            elif l.startswith('HTTP'):
               status = int(l.split(' ', 2)[1])
            continue

         if status is None:
            continue   # ignore empty lines ahead of the status line

         # An empty line marks the end of the header section
         print("%s %s" % (status, headers))
         context = BODY

         encoding = _get_header('Transfer-Encoding')
         if encoding is not None and encoding.lower() == 'chunked':
            transfer_encoding = 'chunked'
         continue

      if transfer_encoding == 'chunked':
         if chunk_length:
            # Wait for the chunk data AND the CRLF that follows it, so a
            # read boundary between the two cannot desynchronise the parser.
            if len(s) < chunk_length + 2:
               break

            if s[chunk_length:chunk_length+2] != '\r\n':
               raise ValueError("chunk data is not followed by CRLF")

            body += s[:chunk_length]
            s = s[chunk_length+2:]
            print("Chopped out %d bytes" % chunk_length)
            chunk_length = None
            continue

         idx = s.find('\r\n')

         if idx < 0:
            break   # chunk-size line not complete yet

         l    = s[:idx]
         rest = s[idx+2:]

         # Anything after ';' on the size line is a chunk extension
         size = int(l.split(';', 1)[0].strip(), 16)

         if size == 0:
            # Last chunk: optional trailer fields, then an empty line.
            # Until that empty line has arrived leave the size line in the
            # buffer, so the next call simply looks at it again.
            if not rest.startswith('\r\n') and rest.find('\r\n\r\n') < 0:
               break

            print("chunk -> [%s]" % l)
            print("# Should be done...")
            chunk_length = 0
            finished = True
            s = body
            break

         print("chunk -> [%s]" % l)
         chunk_length = size
         s = rest
         print("chunk_length -> %d [%d]" % (chunk_length, len(s)))
         continue

      # Body that is not chunked
      length = _get_header('Content-Length')

      if length is None:
         # No framing information: keep everything, caller watches for EOF
         body += s
         s = ''
         break

      length    = int(length)
      remaining = length - len(body)
      body += s[:remaining]
      s = s[remaining:]

      if len(body) >= length:
         finished = True
         s = body
      break

   return (finished, s)

#-----------------------------------------------------------------------

def setup():
   """Open a TCP connection to (URL, PORT) and return the connected socket.

   Raises socket.error when the connection cannot be established; the
   half-built socket is closed first so that it is not leaked.
   """
   # Create an INET, STREAMing socket

   s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

   try:
      s.connect((URL, PORT))
   except socket.error:
      s.close()
      raise

   return s

#-----------------------------------------------------------------------

# Accept-Encoding: gzip, deflate

# The request that get() sends.  HTTP/1.1 terminates every request line with
# CRLF and ends the header section with an empty line, so the text is joined
# with '\r\n' explicitly rather than relying on the newlines of a
# triple-quoted string (which are bare LF).
request = '\r\n'.join([
   'GET /mdcs HTTP/1.1',
   'Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, application/x-shockwave-flash, */*',
   'Accept-Encoding: gzip, deflate',
   'Accept-Language: en-au',
   'User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
   'Host: svtapps',
   'Connection: Keep-Alive',
   '',    # the two empty entries produce the terminating empty line
   '',
])

def get():
   """Send `request` to the server, read the response and print the page.

   Received data is fed to parse() until it reports a complete body or the
   server closes the connection.  The body is then decompressed according
   to the Content-Encoding response header (gzip, x-gzip or deflate) and
   printed; a body without one of those encodings is printed as received.

   The socket is always closed, also when an error occurs.
   """
   s = setup()

   try:
      print("len(request) = %d" % len(request))

      # sendall() keeps writing until the whole request has gone out,
      # whereas send() may stop after only part of it.
      s.sendall(request)

      print("send() -> %d" % len(request))

      residue = ''

      while True:
         recv = s.recv(1024)

         if not recv:
            # An empty read means the server closed the connection; use
            # whatever body has been collected instead of looping forever.
            print(">**> connection closed by server")
            residue = body
            break

         (done, residue) = parse(residue + recv)

         print(">**> %s" % done)

         if done: break
   finally:
      s.close()

   print(len(residue))

   # Header names are case-insensitive, so look the encoding up by scanning
   encoding = ''
   for tag in headers:
      if tag.lower() == 'content-encoding':
         encoding = headers[tag].strip().lower()

   if encoding in ('gzip', 'x-gzip'):
      data = gzip.GzipFile('', 'rb', 9, StringIO.StringIO(residue))
      page = data.read()
   elif encoding == 'deflate':
      page = zlib.decompress(residue)
   else:
      page = residue   # not compressed

   print(page)


#-----------------------------------------------------------------------

def main():
   """Script entry point: run get()."""
   get()

#-----------------------------------------------------------------------

# Only start when executed as a script, so that importing this file does not
# open a network connection as a side effect.
if __name__ == "__main__":
   main()

#-----------------------------------------------------------------------

"""
# zlib.decompressobj().decompress('x\x9c' + binary_str)


So you can do the same from your Python code. Just add ('Accept-Encoding', 'gzip,deflate') to the request headers. Check the following code chunk:

opener = urllib2.build_opener()
opener.addheaders = [('Referer', referer),
('User-Agent', uagent),
('Accept-Encoding', 'gzip,deflate')]
usock = opener.open(url)
url = usock.geturl()
data = decode(usock)
usock.close()
return data

Note the decode() function used in the code. Yes, you have to decode the content (if it's compressed).


def decode (page):
    encoding = page.info().get("Content-Encoding")    
    if encoding in ('gzip', 'x-gzip', 'deflate'):
        content = page.read()
        if encoding == 'deflate':
            data = StringIO.StringIO(zlib.decompress(content))
        else:
            data = gzip.GzipFile('', 'rb', 9, StringIO.StringIO(content))
        page = data.read()

    return page

"""

Handling gzip,deflate Content-Encoding

#!/usr/bin/env python

import sys
import gzip
import getopt
import urllib2
import StringIO

#-------------------------------------------------------------------------------

# Address that get_page() opens.
URL     = 'http://svtapps/mdcs'
# Value sent as the Referer request header by get_page().
referer = 'http://svtapps/'
# Value sent as the User-Agent request header by get_page().
uagent  = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)'

#-------------------------------------------------------------------------------

def get_page():
   """Fetch URL, advertising gzip/deflate support, and return the content.

   The request carries the Referer, User-Agent and Accept-Encoding headers.
   The final address (after any redirects) is printed, and the response is
   handed to decode().  The response is closed even when decode() raises.

   Returns whatever decode() returns for the response.
   """
   opener = urllib2.build_opener()

   opener.addheaders = [
                         ('Referer', referer),
                         ('User-Agent', uagent),
                         ('Accept-Encoding', 'gzip,deflate')
                       ]

   usock = opener.open(URL)

   try:
      url = usock.geturl()

      print("[[%s]]" % url)

      page = decode(usock)
   finally:
      usock.close()

   return page

#-------------------------------------------------------------------------------

def decode(page):
    """Read the response *page* and return its decompressed content.

    page -- a response object providing info() (whose result has a get()
            method for header lookup) and read().

    The response headers are printed.  When Content-Encoding is 'gzip',
    'x-gzip' or 'deflate' the compressed payload is also saved to
    'gzip.dat' for offline inspection and then decompressed.  A response
    without one of those encodings is returned exactly as read.

    Returns the content as a string in every case, so the caller never
    ends up holding a response object that it is about to close.
    """
    # Imported here because this script's import block does not provide them.
    import io
    import zlib

    print(page.info())

    encoding = page.info().get("Content-Encoding")
    content  = page.read()

    if encoding not in ('gzip', 'x-gzip', 'deflate'):
        return content

    # Binary mode: compressed data must not go through newline translation
    f_gzip = open('gzip.dat', 'wb')
    try:
        f_gzip.write(content)
    finally:
        f_gzip.close()

    if encoding == 'deflate':
        try:
            return zlib.decompress(content)
        except zlib.error:
            # Some servers send a raw deflate stream without the zlib
            # header; a negative window size tells zlib to expect that.
            return zlib.decompress(content, -zlib.MAX_WBITS)

    data = gzip.GzipFile('', 'rb', 9, io.BytesIO(content))
    return data.read()

#-------------------------------------------------------------------------------

def usage():
   """Write the usage text for get_page.py to standard error."""
   text = (
      "\n"
      "\n"
      "     Usage:\n"
      "     \n"
      "       $ get_page.py\n"
      "       \n"
      "   "
   )

   sys.stderr.write(text)
   
#-------------------------------------------------------------------------------

def main(argv):
   """Process the command line, then fetch the page and print it.

   argv -- the argument list without the program name.

   Options:
      -h, --help          write the usage text and exit with status 0
      -d, --debug         raise the debug level by one
      -D, --debug_cnt N   set the debug level to N
      -v, --verbose       switch the verbose flag on

   An unknown option, or a value for -D that is not a number, writes the
   usage text and exits with status 2.
   """
   global debug_lvl, verbose_flg

   # Start from known values so that "-d" has something to increment
   debug_lvl   = 0
   verbose_flg = False

   #----- Process command line arguments ----------------------------

   try:
      opts, args = getopt.getopt(argv, "dD:hv",
              ["debug", "debug_cnt=", "help","verbose"])
   except getopt.GetoptError:
      usage()
      sys.exit(2)

   for opt, arg in opts:
      if opt in ("-h", "--help"):
         usage()
         sys.exit(0)
      elif opt in ("-d", "--debug"):
         debug_lvl      += 1
      elif opt in ("-D", "--debug_cnt"):
         try:
            debug_lvl    = int(arg)
         except ValueError:
            usage()
            sys.exit(2)
      elif opt in ("-v", "--verbose"):
         verbose_flg     = True

   page = get_page()

   print(page)

#-------------------------------------------------------------------------------

if __name__ == "__main__":
   main(sys.argv[1:])

#-------------------------------------------------------------------------------

Using GZIP Module

#!/usr/bin/env python

import sys
import gzip
import getopt
import StringIO

#-------------------------------------------------------------------------------

def decode():
    """Decompress the gzip data stored in 'gzip.dat' and print the result.

    The file is opened through the gzip module in binary mode, so the
    compressed bytes are never subjected to text-mode translation, and it
    is closed even when decompression fails.
    """
    f_gzip = gzip.open('gzip.dat', 'rb')

    try:
        page = f_gzip.read()
    finally:
        f_gzip.close()

    print(page)

#-------------------------------------------------------------------------------

def usage():
   """Write the usage text for unzip.py to standard error."""
   text = (
      "\n"
      "\n"
      "     Usage:\n"
      "     \n"
      "       $ unzip.py\n"
      "       \n"
      "   "
   )

   sys.stderr.write(text)
   
#-------------------------------------------------------------------------------

def main(argv):
   """Process the command line, then run decode().

   argv -- the argument list without the program name.

   Options:
      -h, --help          write the usage text and exit with status 0
      -d, --debug         raise the debug level by one
      -D, --debug_cnt N   set the debug level to N
      -v, --verbose       switch the verbose flag on

   An unknown option, or a value for -D that is not a number, writes the
   usage text and exits with status 2.
   """
   global debug_lvl, verbose_flg

   # Start from known values so that "-d" has something to increment
   debug_lvl   = 0
   verbose_flg = False

   #----- Process command line arguments ----------------------------

   try:
      opts, args = getopt.getopt(argv, "dD:hv",
              ["debug", "debug_cnt=", "help","verbose"])
   except getopt.GetoptError:
      usage()
      sys.exit(2)

   for opt, arg in opts:
      if opt in ("-h", "--help"):
         usage()
         sys.exit(0)
      elif opt in ("-d", "--debug"):
         debug_lvl      += 1
      elif opt in ("-D", "--debug_cnt"):
         try:
            debug_lvl    = int(arg)
         except ValueError:
            usage()
            sys.exit(2)
      elif opt in ("-v", "--verbose"):
         verbose_flg     = True

   decode()

#-------------------------------------------------------------------------------

if __name__ == "__main__":
   main(sys.argv[1:])

#-------------------------------------------------------------------------------