<?xml version="1.0"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
	<id>https://performiq.com/kb/index.php?action=history&amp;feed=atom&amp;title=Editing_Parsing_WhitePages_Search_Results_HTML</id>
	<title>Editing Parsing WhitePages Search Results HTML - Revision history</title>
	<link rel="self" type="application/atom+xml" href="https://performiq.com/kb/index.php?action=history&amp;feed=atom&amp;title=Editing_Parsing_WhitePages_Search_Results_HTML"/>
	<link rel="alternate" type="text/html" href="https://performiq.com/kb/index.php?title=Editing_Parsing_WhitePages_Search_Results_HTML&amp;action=history"/>
	<updated>2026-05-18T11:38:27Z</updated>
	<subtitle>Revision history for this page on the wiki</subtitle>
	<generator>MediaWiki 1.37.1</generator>
	<entry>
		<id>https://performiq.com/kb/index.php?title=Editing_Parsing_WhitePages_Search_Results_HTML&amp;diff=5149&amp;oldid=prev</id>
		<title>PeterHarding: Created page with &quot;= Searching WhitePages =  The following Python script uses HTTPLib to search against whitepages.com.au (via a proxy).  &lt;pre&gt;  #!/usr/bin/env python  #  #  #-------------------...&quot;</title>
		<link rel="alternate" type="text/html" href="https://performiq.com/kb/index.php?title=Editing_Parsing_WhitePages_Search_Results_HTML&amp;diff=5149&amp;oldid=prev"/>
		<updated>2021-10-16T10:45:21Z</updated>

		<summary type="html">&lt;p&gt;Created page with &amp;quot;= Searching WhitePages =  The following Python script uses HTTPLib to search against whitepages.com.au (via a proxy).  &amp;lt;pre&amp;gt;  #!/usr/bin/env python  #  #  #-------------------...&amp;quot;&lt;/p&gt;
&lt;p&gt;&lt;b&gt;New page&lt;/b&gt;&lt;/p&gt;&lt;div&gt;= Searching WhitePages =&lt;br /&gt;
&lt;br /&gt;
The following Python script uses HTTPLib to search against whitepages.com.au (via a proxy).&lt;br /&gt;
&lt;br /&gt;
&amp;lt;pre&amp;gt;&lt;br /&gt;
 #!/usr/bin/env python&lt;br /&gt;
 #&lt;br /&gt;
 #&lt;br /&gt;
 #-------------------------------------------------------------------------------&lt;br /&gt;
 &lt;br /&gt;
 import re&lt;br /&gt;
 import sys&lt;br /&gt;
 import base64&lt;br /&gt;
 import pprint&lt;br /&gt;
 import urllib&lt;br /&gt;
 import httplib&lt;br /&gt;
 &lt;br /&gt;
 from copy import copy&lt;br /&gt;
 &lt;br /&gt;
 #-------------------------------------------------------------------------------&lt;br /&gt;
 &lt;br /&gt;
 PROXY       = &amp;#039;PROXY:8080&amp;#039;&lt;br /&gt;
 SITE        = &amp;#039;www.whitepages.com.au&amp;#039;&lt;br /&gt;
 &lt;br /&gt;
 connection  = None&lt;br /&gt;
 &lt;br /&gt;
 #===== Headers =================================================================&lt;br /&gt;
 &lt;br /&gt;
 #    &amp;#039;Accept&amp;#039; : &amp;#039;text/plain, text/html&amp;#039;,&lt;br /&gt;
 &lt;br /&gt;
 get_headers = {&lt;br /&gt;
    &amp;#039;Accept-Encoding&amp;#039;    : &amp;#039;gzip, deflate&amp;#039;,&lt;br /&gt;
    &amp;#039;Accept&amp;#039;             : &amp;#039;*/*&amp;#039;,&lt;br /&gt;
    &amp;#039;Accept-Language&amp;#039;    : &amp;#039;en-au&amp;#039;,&lt;br /&gt;
    &amp;#039;Host&amp;#039;               : SITE,&lt;br /&gt;
    &amp;#039;Connection&amp;#039;         : &amp;#039;Keep-Alive&amp;#039;,&lt;br /&gt;
    &amp;#039;User-Agent&amp;#039;         : &amp;#039;Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727&amp;#039;&lt;br /&gt;
 }&lt;br /&gt;
 &lt;br /&gt;
 post_headers = {&lt;br /&gt;
    &amp;#039;Content-type&amp;#039;       : &amp;#039;application/x-www-form-urlencoded&amp;#039;,&lt;br /&gt;
    &amp;#039;Accept&amp;#039;             : &amp;#039;text/plain&amp;#039;&lt;br /&gt;
 }&lt;br /&gt;
 &lt;br /&gt;
 idx      = 0&lt;br /&gt;
 &lt;br /&gt;
 #===== Logging =================================================================&lt;br /&gt;
 &lt;br /&gt;
 def log_req_header(idx, hdr):&lt;br /&gt;
    of = open(&amp;#039;log/%04d.req&amp;#039; % idx, &amp;#039;w&amp;#039;)&lt;br /&gt;
    of.write(&amp;quot;%s\n&amp;quot; % pprint.pformat(hdr))&lt;br /&gt;
    of.close()&lt;br /&gt;
 &lt;br /&gt;
 #-------------------------------------------------------------------------------&lt;br /&gt;
 &lt;br /&gt;
 def log_resp_header(idx, resp):&lt;br /&gt;
    of = open(&amp;#039;log/%04d.hdr&amp;#039; % idx, &amp;#039;w&amp;#039;)&lt;br /&gt;
 &lt;br /&gt;
    of.write(&amp;quot;resp.__dict__ -&amp;gt;\n%s\n\n&amp;quot; % pprint.pformat(resp.__dict__))&lt;br /&gt;
    of.write(&amp;quot;Status %s  Reason [%s]\n&amp;quot; % (resp.status, resp.reason))&lt;br /&gt;
    of.write(&amp;quot;Msg -&amp;gt;\n%s\n\n&amp;quot; % resp.msg)&lt;br /&gt;
    of.write(&amp;quot;Msg.__dict__ -&amp;gt;\n%s\n\n&amp;quot; % pprint.pformat(resp.msg.__dict__))&lt;br /&gt;
 &lt;br /&gt;
    of.close()&lt;br /&gt;
 &lt;br /&gt;
 #-------------------------------------------------------------------------------&lt;br /&gt;
 &lt;br /&gt;
 def log_resp_body(idx, resp_body):&lt;br /&gt;
    of = open(&amp;#039;log/%04d.bdy&amp;#039; % idx, &amp;#039;w&amp;#039;)&lt;br /&gt;
    of.write(resp_body);&lt;br /&gt;
    of.close()&lt;br /&gt;
 &lt;br /&gt;
 #===== Encapsulate the request code ============================================&lt;br /&gt;
 &lt;br /&gt;
 def request(method, url, params, headers):&lt;br /&gt;
    global idx&lt;br /&gt;
 &lt;br /&gt;
    print &amp;#039;&amp;gt;&amp;gt;&amp;gt;&amp;gt; %s %s &amp;lt;&amp;lt;&amp;lt;&amp;lt;&amp;#039; % (method, url)&lt;br /&gt;
 &lt;br /&gt;
    connection.request(method, url, params, headers)&lt;br /&gt;
 &lt;br /&gt;
    resp = connection.getresponse()&lt;br /&gt;
 &lt;br /&gt;
    log_req_header(idx, headers)&lt;br /&gt;
    log_resp_header(idx, resp)&lt;br /&gt;
 &lt;br /&gt;
    resp_body = resp.read()&lt;br /&gt;
 &lt;br /&gt;
    log_resp_body(idx, resp_body)&lt;br /&gt;
 &lt;br /&gt;
    print resp_body&lt;br /&gt;
 &lt;br /&gt;
    idx += 1&lt;br /&gt;
 &lt;br /&gt;
    return resp&lt;br /&gt;
 &lt;br /&gt;
 #===============================================================================&lt;br /&gt;
 &lt;br /&gt;
 def do():&lt;br /&gt;
    global connection&lt;br /&gt;
    connection  = httplib.HTTPConnection(PROXY)&lt;br /&gt;
 &lt;br /&gt;
    BASE_URL    = &amp;#039;http://%s&amp;#039; % SITE&lt;br /&gt;
 &lt;br /&gt;
    #------------------------------------------------------------------------&lt;br /&gt;
 &lt;br /&gt;
    DO       = &amp;#039;GET&amp;#039;&lt;br /&gt;
    URL      = BASE_URL + &amp;#039;/&amp;#039;&lt;br /&gt;
 &lt;br /&gt;
    headers = copy(get_headers)&lt;br /&gt;
 &lt;br /&gt;
    request(DO, URL, None, headers)&lt;br /&gt;
 &lt;br /&gt;
 &lt;br /&gt;
    #------------------------------------------------------------------------&lt;br /&gt;
 &lt;br /&gt;
    DO  = &amp;#039;GET&amp;#039;&lt;br /&gt;
    URL = BASE_URL + &amp;#039;/wp/index.jsp&amp;#039;&lt;br /&gt;
 &lt;br /&gt;
    headers = copy(get_headers)&lt;br /&gt;
 &lt;br /&gt;
    resp = request(DO, URL, None, headers)&lt;br /&gt;
 &lt;br /&gt;
    m = re.search(&amp;#039;JSESSIONID=(.*);&amp;#039;, resp.msg.__dict__[&amp;#039;dict&amp;#039;][&amp;#039;set-cookie&amp;#039;])&lt;br /&gt;
 &lt;br /&gt;
    if m:&lt;br /&gt;
       print m.group(1)&lt;br /&gt;
       JSESSIONID = m.group(1)&lt;br /&gt;
 &lt;br /&gt;
    print JSESSIONID&lt;br /&gt;
 &lt;br /&gt;
    #---------------------------------------------------------------------&lt;br /&gt;
 &lt;br /&gt;
    DO  = &amp;#039;POST&amp;#039;&lt;br /&gt;
    URL = BASE_URL + &amp;#039;/wp/busSearch.do;jsessionid=%s&amp;#039; % JSESSIONID&lt;br /&gt;
 &lt;br /&gt;
    headers = copy(post_headers)&lt;br /&gt;
 &lt;br /&gt;
    form_data = {&lt;br /&gt;
       &amp;#039;subscriberName&amp;#039; : &amp;#039;Hard&amp;#039;,&lt;br /&gt;
       &amp;#039;state&amp;#039;          : &amp;#039;VIC&amp;#039;,&lt;br /&gt;
       &amp;#039;suburb&amp;#039;         : &amp;#039;&amp;#039;,&lt;br /&gt;
       &amp;#039;street&amp;#039;         : &amp;#039;&amp;#039;,&lt;br /&gt;
       &amp;#039;Search&amp;#039;         : &amp;#039;Search&amp;#039;&lt;br /&gt;
    }&lt;br /&gt;
 &lt;br /&gt;
    params = urllib.urlencode(form_data)&lt;br /&gt;
 &lt;br /&gt;
    headers[&amp;#039;Content-Length&amp;#039;] = len(params)&lt;br /&gt;
    headers[&amp;#039;Cookie&amp;#039;]         = &amp;#039;JSESSIONID=%s&amp;#039; % JSESSIONID&lt;br /&gt;
 &lt;br /&gt;
    request(DO, URL, params, headers)&lt;br /&gt;
 &lt;br /&gt;
    #---------------------------------------------------------------------&lt;br /&gt;
 &lt;br /&gt;
    URL = BASE_URL + &amp;#039;/wp/busSearch.do&amp;#039;&lt;br /&gt;
 &lt;br /&gt;
    form_data = {&lt;br /&gt;
       &amp;#039;subscriberName&amp;#039; : &amp;#039;Hard&amp;#039;,&lt;br /&gt;
       &amp;#039;state&amp;#039;          : &amp;#039;VIC&amp;#039;,&lt;br /&gt;
       &amp;#039;page&amp;#039;           : &amp;#039;2&amp;#039;&lt;br /&gt;
    }&lt;br /&gt;
 &lt;br /&gt;
    params = urllib.urlencode(form_data)&lt;br /&gt;
 &lt;br /&gt;
    headers[&amp;#039;Content-Length&amp;#039;] = len(params)&lt;br /&gt;
    headers[&amp;#039;Cookie&amp;#039;]         = &amp;#039;JSESSIONID=%s&amp;#039; % JSESSIONID&lt;br /&gt;
 &lt;br /&gt;
    request(DO, URL, params, headers)&lt;br /&gt;
  &lt;br /&gt;
 #===============================================================================&lt;br /&gt;
 &lt;br /&gt;
 do()&lt;br /&gt;
 &lt;br /&gt;
 #-------------------------------------------------------------------------------&lt;br /&gt;
&amp;lt;/pre&amp;gt;&lt;br /&gt;
&lt;br /&gt;
This script writes the search results into files (page 1 =&amp;gt; &amp;#039;log/0002.bdy&amp;#039; and page 2 =&amp;gt; &amp;#039;log/0003.bdy&amp;#039;).  Amend the above code to handle more pages of search results being produced.&lt;br /&gt;
&lt;br /&gt;
= Parsing the Search Results =&lt;br /&gt;
&lt;br /&gt;
The search results HTML looks as follows:&lt;br /&gt;
&lt;br /&gt;
&amp;lt;pre&amp;gt;&lt;br /&gt;
&amp;lt;div class=&amp;quot;encap_result&amp;quot; id=&amp;quot;result-10&amp;quot;&amp;gt;&amp;lt;ul&amp;gt;&amp;lt;li id=&amp;#039;res10-ln0&amp;#039;&amp;gt;&amp;lt;h4&amp;gt;&amp;lt;span class=&amp;#039;blackboldcaps&amp;#039;&amp;gt;Hard ...&amp;lt;/ul&amp;gt;&amp;lt;/li&amp;gt;&amp;lt;/ul&amp;gt;&amp;lt;div class=&amp;quot;clearMe&amp;quot;&amp;gt;&amp;amp;nbsp;&amp;lt;/div&amp;gt;&amp;lt;/div&amp;gt;&amp;lt;div class=&amp;quot;encap_result&amp;quot; id=&amp;quot;result-11&amp;quot;&amp;gt;&amp;lt;ul&amp;gt;&amp;lt;li&amp;gt;&lt;br /&gt;
&amp;lt;/pre&amp;gt;&lt;br /&gt;
&lt;br /&gt;
=Results of Search=&lt;br /&gt;
&lt;br /&gt;
Are written to a file in a block of XML.&lt;br /&gt;
&lt;br /&gt;
=A more Complex Script=&lt;br /&gt;
&lt;br /&gt;
This script loops through all the available pages in the search results and parses out the search results using the BeautifulSoup HTML parsing module.&lt;br /&gt;
&lt;br /&gt;
==wp_get.py==&lt;br /&gt;
&lt;br /&gt;
&amp;lt;pre&amp;gt;&lt;br /&gt;
#!/usr/bin/env python&lt;br /&gt;
#&lt;br /&gt;
#&lt;br /&gt;
#-------------------------------------------------------------------------------&lt;br /&gt;
&lt;br /&gt;
import re&lt;br /&gt;
import sys&lt;br /&gt;
import base64&lt;br /&gt;
import pprint&lt;br /&gt;
import urllib&lt;br /&gt;
import httplib&lt;br /&gt;
&lt;br /&gt;
from copy import copy&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
#-------------------------------------------------------------------------------&lt;br /&gt;
&lt;br /&gt;
from wp_parser import parse&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
#-------------------------------------------------------------------------------&lt;br /&gt;
&lt;br /&gt;
PROXY          = &amp;#039;PROXY:8080&amp;#039;&lt;br /&gt;
SITE           = &amp;#039;www.whitepages.com.au&amp;#039;&lt;br /&gt;
&lt;br /&gt;
connection     = None&lt;br /&gt;
next_page_pat  = None&lt;br /&gt;
&lt;br /&gt;
results        = &amp;#039;&amp;#039;&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
#-------------------------------------------------------------------------------&lt;br /&gt;
&lt;br /&gt;
#    &amp;#039;Accept&amp;#039; : &amp;#039;text/plain, text/html&amp;#039;,&lt;br /&gt;
&lt;br /&gt;
get_headers = {&lt;br /&gt;
   &amp;#039;Accept-Encoding&amp;#039;    : &amp;#039;gzip, deflate&amp;#039;,&lt;br /&gt;
   &amp;#039;Accept&amp;#039;             : &amp;#039;*/*&amp;#039;,&lt;br /&gt;
   &amp;#039;Accept-Language&amp;#039;    : &amp;#039;en-au&amp;#039;,&lt;br /&gt;
   &amp;#039;Host&amp;#039;               : SITE,&lt;br /&gt;
   &amp;#039;Connection&amp;#039;         : &amp;#039;Keep-Alive&amp;#039;,&lt;br /&gt;
   &amp;#039;User-Agent&amp;#039;         : &amp;#039;Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727&amp;#039;&lt;br /&gt;
}&lt;br /&gt;
&lt;br /&gt;
post_headers = {&lt;br /&gt;
   &amp;#039;Content-type&amp;#039;     : &amp;#039;application/x-www-form-urlencoded&amp;#039;,&lt;br /&gt;
   &amp;#039;Accept&amp;#039;           : &amp;#039;text/plain&amp;#039;&lt;br /&gt;
}&lt;br /&gt;
&lt;br /&gt;
idx      = 0&lt;br /&gt;
&lt;br /&gt;
#-------------------------------------------------------------------------------&lt;br /&gt;
&lt;br /&gt;
def log_req_header(idx, hdr):&lt;br /&gt;
   of = open(&amp;#039;log/%04d.req&amp;#039; % idx, &amp;#039;w&amp;#039;)&lt;br /&gt;
   of.write(&amp;quot;%s\n&amp;quot; % pprint.pformat(hdr))&lt;br /&gt;
   of.close()&lt;br /&gt;
&lt;br /&gt;
#-------------------------------------------------------------------------------&lt;br /&gt;
&lt;br /&gt;
def log_resp_header(idx, resp):&lt;br /&gt;
   of = open(&amp;#039;log/%04d.hdr&amp;#039; % idx, &amp;#039;w&amp;#039;)&lt;br /&gt;
&lt;br /&gt;
   of.write(&amp;quot;resp.__dict__ -&amp;gt;\n%s\n\n&amp;quot; % pprint.pformat(resp.__dict__))&lt;br /&gt;
   of.write(&amp;quot;Status %s  Reason [%s]\n&amp;quot; % (resp.status, resp.reason))&lt;br /&gt;
   of.write(&amp;quot;Msg -&amp;gt;\n%s\n\n&amp;quot; % resp.msg)&lt;br /&gt;
   of.write(&amp;quot;Msg.__dict__ -&amp;gt;\n%s\n\n&amp;quot; % pprint.pformat(resp.msg.__dict__))&lt;br /&gt;
&lt;br /&gt;
   of.close()&lt;br /&gt;
&lt;br /&gt;
#-------------------------------------------------------------------------------&lt;br /&gt;
&lt;br /&gt;
def log_resp_body(idx, resp_body):&lt;br /&gt;
   of = open(&amp;#039;log/%04d.bdy&amp;#039; % idx, &amp;#039;w&amp;#039;)&lt;br /&gt;
   of.write(resp_body);&lt;br /&gt;
   of.close()&lt;br /&gt;
&lt;br /&gt;
#-------------------------------------------------------------------------------&lt;br /&gt;
&lt;br /&gt;
def do(pattern, state):&lt;br /&gt;
   global connection&lt;br /&gt;
   global results&lt;br /&gt;
&lt;br /&gt;
   print &amp;quot;Pattern ==&amp;gt; &amp;#039;%s&amp;#039;&amp;quot; % pattern&lt;br /&gt;
&lt;br /&gt;
   connection  = httplib.HTTPConnection(PROXY)&lt;br /&gt;
&lt;br /&gt;
   BASE_URL    = &amp;#039;http://%s&amp;#039; % SITE&lt;br /&gt;
&lt;br /&gt;
   #------------------------------------------------------------------------&lt;br /&gt;
&lt;br /&gt;
   DO       = &amp;#039;GET&amp;#039;&lt;br /&gt;
   URL      = BASE_URL + &amp;#039;/&amp;#039;&lt;br /&gt;
&lt;br /&gt;
   headers = copy(get_headers)&lt;br /&gt;
&lt;br /&gt;
   request(DO, URL, None, headers)&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
   #------------------------------------------------------------------------&lt;br /&gt;
&lt;br /&gt;
   DO  = &amp;#039;GET&amp;#039;&lt;br /&gt;
   URL = BASE_URL + &amp;#039;/wp/index.jsp&amp;#039;&lt;br /&gt;
&lt;br /&gt;
   headers = copy(get_headers)&lt;br /&gt;
&lt;br /&gt;
   r = request(DO, URL, None, headers)&lt;br /&gt;
&lt;br /&gt;
   m = re.search(&amp;#039;JSESSIONID=(.*);&amp;#039;, r[&amp;#039;response_header&amp;#039;].msg.__dict__[&amp;#039;dict&amp;#039;][&amp;#039;set-cookie&amp;#039;])&lt;br /&gt;
&lt;br /&gt;
   if m:&lt;br /&gt;
      print m.group(1)&lt;br /&gt;
      JSESSIONID = m.group(1)&lt;br /&gt;
&lt;br /&gt;
   print JSESSIONID&lt;br /&gt;
&lt;br /&gt;
   #---------------------------------------------------------------------&lt;br /&gt;
&lt;br /&gt;
   DO  = &amp;#039;POST&amp;#039;&lt;br /&gt;
   URL = BASE_URL + &amp;#039;/wp/busSearch.do;jsessionid=%s&amp;#039; % JSESSIONID&lt;br /&gt;
&lt;br /&gt;
   headers = copy(post_headers)&lt;br /&gt;
&lt;br /&gt;
   form_data = {&lt;br /&gt;
      &amp;#039;subscriberName&amp;#039; : pattern,&lt;br /&gt;
      &amp;#039;state&amp;#039;          : state,&lt;br /&gt;
      &amp;#039;suburb&amp;#039;         : &amp;#039;&amp;#039;,&lt;br /&gt;
      &amp;#039;street&amp;#039;         : &amp;#039;&amp;#039;,&lt;br /&gt;
      &amp;#039;Search&amp;#039;         : &amp;#039;Search&amp;#039;&lt;br /&gt;
   }&lt;br /&gt;
&lt;br /&gt;
   params = urllib.urlencode(form_data)&lt;br /&gt;
&lt;br /&gt;
   headers[&amp;#039;Content-Length&amp;#039;] = len(params)&lt;br /&gt;
   headers[&amp;#039;Cookie&amp;#039;]         = &amp;#039;JSESSIONID=%s&amp;#039; % JSESSIONID&lt;br /&gt;
&lt;br /&gt;
   r = request(DO, URL, params, headers)&lt;br /&gt;
&lt;br /&gt;
   xml = parse(&amp;#039;log/%04d.bdy&amp;#039; % r[&amp;#039;idx&amp;#039;])&lt;br /&gt;
&lt;br /&gt;
   results += xml&lt;br /&gt;
&lt;br /&gt;
   body = r[&amp;#039;response_body&amp;#039;]&lt;br /&gt;
&lt;br /&gt;
   m = next_page_pat.findall(body)&lt;br /&gt;
&lt;br /&gt;
   cnt = 0&lt;br /&gt;
&lt;br /&gt;
   if m:&lt;br /&gt;
      no_pages = len(m)&lt;br /&gt;
      if no_pages &amp;gt; 0:&lt;br /&gt;
         for i in range(no_pages):&lt;br /&gt;
            print m[i]&lt;br /&gt;
            #print m[i].group(1)&lt;br /&gt;
&lt;br /&gt;
         cnt = no_pages - 1&lt;br /&gt;
&lt;br /&gt;
   #---------------------------------------------------------------------&lt;br /&gt;
&lt;br /&gt;
   for i in range(cnt):&lt;br /&gt;
&lt;br /&gt;
      URL = BASE_URL + &amp;#039;/wp/busSearch.do&amp;#039;&lt;br /&gt;
&lt;br /&gt;
      form_data = {&lt;br /&gt;
         &amp;#039;subscriberName&amp;#039; : pattern,&lt;br /&gt;
         &amp;#039;state&amp;#039;          : state,&lt;br /&gt;
         &amp;#039;page&amp;#039;           : i + 1&lt;br /&gt;
      }&lt;br /&gt;
&lt;br /&gt;
      params = urllib.urlencode(form_data)&lt;br /&gt;
&lt;br /&gt;
      headers[&amp;#039;Content-Length&amp;#039;] = len(params)&lt;br /&gt;
      headers[&amp;#039;Cookie&amp;#039;]         = &amp;#039;JSESSIONID=%s&amp;#039; % JSESSIONID&lt;br /&gt;
&lt;br /&gt;
      r = request(DO, URL, params, headers)&lt;br /&gt;
&lt;br /&gt;
      xml = parse(&amp;#039;log/%04d.bdy&amp;#039; % r[&amp;#039;idx&amp;#039;])&lt;br /&gt;
&lt;br /&gt;
      results += xml&lt;br /&gt;
&lt;br /&gt;
   ofh = open(&amp;#039;results.xml&amp;#039;, &amp;#039;a+&amp;#039;)&lt;br /&gt;
&lt;br /&gt;
   ofh.write(results)&lt;br /&gt;
&lt;br /&gt;
   ofh.close()&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
#-------------------------------------------------------------------------------&lt;br /&gt;
&lt;br /&gt;
def request(method, url, params, headers):&lt;br /&gt;
   global idx&lt;br /&gt;
&lt;br /&gt;
   print &amp;#039;&amp;gt;&amp;gt;&amp;gt;&amp;gt; %s %s &amp;lt;&amp;lt;&amp;lt;&amp;lt;&amp;#039; % (method, url)&lt;br /&gt;
&lt;br /&gt;
   connection.request(method, url, params, headers)&lt;br /&gt;
&lt;br /&gt;
   resp = connection.getresponse()&lt;br /&gt;
&lt;br /&gt;
   log_req_header(idx, headers)&lt;br /&gt;
   log_resp_header(idx, resp)&lt;br /&gt;
&lt;br /&gt;
   resp_body = resp.read()&lt;br /&gt;
&lt;br /&gt;
   log_resp_body(idx, resp_body)&lt;br /&gt;
&lt;br /&gt;
   # print resp_body&lt;br /&gt;
&lt;br /&gt;
   r = {&amp;#039;idx&amp;#039; : idx, &amp;#039;request_header&amp;#039; : headers, &amp;#039;response_header&amp;#039; : resp, &amp;#039;response_body&amp;#039; : resp_body}&lt;br /&gt;
&lt;br /&gt;
   idx += 1&lt;br /&gt;
&lt;br /&gt;
   return r&lt;br /&gt;
&lt;br /&gt;
#-------------------------------------------------------------------------------&lt;br /&gt;
&lt;br /&gt;
def process(search_patterns):&lt;br /&gt;
   global next_page_pat&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
   next_page_pat = re.compile(r&amp;#039;;(page=[0-9]*&amp;quot;&amp;gt;[0-9]*&amp;lt;\/a&amp;gt;)&amp;#039;)&lt;br /&gt;
&lt;br /&gt;
   for search_pattern in search_patterns:&lt;br /&gt;
     do(search_pattern, &amp;#039;VIC&amp;#039;)&lt;br /&gt;
&lt;br /&gt;
#-------------------------------------------------------------------------------&lt;br /&gt;
&lt;br /&gt;
def searches():&lt;br /&gt;
&lt;br /&gt;
   searches = []&lt;br /&gt;
&lt;br /&gt;
   for i in range(4):&lt;br /&gt;
      ch = chr(ord(&amp;#039;W&amp;#039;) + i)&lt;br /&gt;
      for x in (&amp;#039;a&amp;#039;, &amp;#039;e&amp;#039;, &amp;#039;i&amp;#039;, &amp;#039;o&amp;#039;, &amp;#039;u&amp;#039;):&lt;br /&gt;
         pat  = ch + x&lt;br /&gt;
         searches.append(pat)&lt;br /&gt;
&lt;br /&gt;
   return searches&lt;br /&gt;
&lt;br /&gt;
#-------------------------------------------------------------------------------&lt;br /&gt;
&lt;br /&gt;
def used():&lt;br /&gt;
   searches = []&lt;br /&gt;
   searches.append(&amp;#039;Zeus&amp;#039;)&lt;br /&gt;
&lt;br /&gt;
#-------------------------------------------------------------------------------&lt;br /&gt;
&lt;br /&gt;
def main():&lt;br /&gt;
   searches = []&lt;br /&gt;
&lt;br /&gt;
   searches.append(&amp;#039;Zany&amp;#039;)&lt;br /&gt;
   searches.append(&amp;#039;Zan&amp;#039;)&lt;br /&gt;
   searches.append(&amp;#039;Zen&amp;#039;)&lt;br /&gt;
   searches.append(&amp;#039;Zend&amp;#039;)&lt;br /&gt;
&lt;br /&gt;
   process(searches)&lt;br /&gt;
&lt;br /&gt;
#-------------------------------------------------------------------------------&lt;br /&gt;
&lt;br /&gt;
main()&lt;br /&gt;
&lt;br /&gt;
#-------------------------------------------------------------------------------&lt;br /&gt;
&amp;lt;/pre&amp;gt;&lt;br /&gt;
&lt;br /&gt;
==wp_parser.py==&lt;br /&gt;
&lt;br /&gt;
&amp;lt;pre&amp;gt;&lt;br /&gt;
#!/usr/bin/env python&lt;br /&gt;
&lt;br /&gt;
import pprint&lt;br /&gt;
&lt;br /&gt;
from BeautifulSoup import BeautifulSoup&lt;br /&gt;
&lt;br /&gt;
#-------------------------------------------------------------------------------------&lt;br /&gt;
&lt;br /&gt;
def parse(fname):&lt;br /&gt;
   doc = open(fname, &amp;#039;r&amp;#039;)&lt;br /&gt;
&lt;br /&gt;
   soup = BeautifulSoup(doc)&lt;br /&gt;
&lt;br /&gt;
   # print len(soup(&amp;#039;table&amp;#039;, { &amp;quot;class&amp;quot; : &amp;quot;table_style&amp;quot;}))&lt;br /&gt;
&lt;br /&gt;
   # tables = soup.findAll(&amp;#039;table&amp;#039;, { &amp;quot;class&amp;quot; : &amp;quot;table_style&amp;quot;})&lt;br /&gt;
   objs = soup.findAll(&amp;#039;div&amp;#039;, { &amp;quot;class&amp;quot; : &amp;quot;encap_result&amp;quot;})&lt;br /&gt;
&lt;br /&gt;
   pp = pprint.PrettyPrinter(3)&lt;br /&gt;
&lt;br /&gt;
   xml = &amp;#039;&amp;#039;&lt;br /&gt;
&lt;br /&gt;
   for obj in objs:&lt;br /&gt;
      t = obj.find(text=True)&lt;br /&gt;
&lt;br /&gt;
      if t:&lt;br /&gt;
         xml +=  &amp;#039;&amp;lt;entry&amp;gt;\n&amp;#039;&lt;br /&gt;
&lt;br /&gt;
         #print &amp;#039;[[%s]]\n\n&amp;#039; % obj.__dict__&lt;br /&gt;
         # print &amp;#039;[[%s]]\n\n&amp;#039; % obj&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
         f    = obj.findAll(&amp;#039;span&amp;#039;,  { &amp;#039;class&amp;#039; : &amp;#039;black&amp;#039;})&lt;br /&gt;
&lt;br /&gt;
         for s in f:&lt;br /&gt;
            xml +=  &amp;#039;  &amp;lt;tag&amp;gt;%s&amp;lt;/tag&amp;gt;\n&amp;#039; % s.find(text=True)&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
         f    = obj.findAll(&amp;#039;input&amp;#039;,  { &amp;quot;name&amp;quot; : &amp;#039;placeName&amp;#039;})&lt;br /&gt;
&lt;br /&gt;
         for s in f:&lt;br /&gt;
            # pp.pprint(s.__dict__)&lt;br /&gt;
            # print &amp;#039;attrMap -&amp;gt; &amp;quot;%s&amp;quot;&amp;#039; % s.attrMap&lt;br /&gt;
            xml += &amp;#039;  &amp;lt;placeName&amp;gt;%s&amp;lt;/placeName&amp;gt;\n&amp;#039; % s.attrMap[&amp;#039;value&amp;#039;]&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
         for s in obj.findAll(&amp;#039;input&amp;#039;,  { &amp;quot;name&amp;quot; : &amp;#039;subscriberName&amp;#039;}):&lt;br /&gt;
            xml += &amp;#039;  &amp;lt;subscriberName&amp;gt;%s&amp;lt;/subscriberName&amp;gt;\n&amp;#039; % s.attrMap[&amp;#039;value&amp;#039;]&lt;br /&gt;
&lt;br /&gt;
         for s in obj.findAll(&amp;#039;input&amp;#039;,  { &amp;quot;name&amp;quot; : &amp;#039;address&amp;#039;}):&lt;br /&gt;
            xml += &amp;#039;  &amp;lt;address&amp;gt;%s&amp;lt;/address&amp;gt;\n&amp;#039; % s.attrMap[&amp;#039;value&amp;#039;]&lt;br /&gt;
&lt;br /&gt;
         for s in obj.findAll(&amp;#039;input&amp;#039;,  { &amp;quot;name&amp;quot; : &amp;#039;streetNumber&amp;#039;}):&lt;br /&gt;
            xml += &amp;#039;  &amp;lt;streetNumber&amp;gt;%s&amp;lt;/streetNumber&amp;gt;\n&amp;#039; % s.attrMap[&amp;#039;value&amp;#039;]&lt;br /&gt;
&lt;br /&gt;
         for s in obj.findAll(&amp;#039;input&amp;#039;,  { &amp;quot;name&amp;quot; : &amp;#039;streetName&amp;#039;}):&lt;br /&gt;
            xml += &amp;#039;  &amp;lt;streetName&amp;gt;%s&amp;lt;/streetName&amp;gt;\n&amp;#039; % s.attrMap[&amp;#039;value&amp;#039;]&lt;br /&gt;
&lt;br /&gt;
         for s in obj.findAll(&amp;#039;input&amp;#039;,  { &amp;quot;name&amp;quot; : &amp;#039;streetType&amp;#039;}):&lt;br /&gt;
            xml += &amp;#039;  &amp;lt;streetType&amp;gt;%s&amp;lt;/streetType&amp;gt;\n&amp;#039; % s.attrMap[&amp;#039;value&amp;#039;]&lt;br /&gt;
&lt;br /&gt;
         for s in obj.findAll(&amp;#039;input&amp;#039;,  { &amp;quot;name&amp;quot; : &amp;#039;locality&amp;#039;}):&lt;br /&gt;
            xml += &amp;#039;  &amp;lt;locality&amp;gt;%s&amp;lt;/locality&amp;gt;\n&amp;#039; % s.attrMap[&amp;#039;value&amp;#039;]&lt;br /&gt;
&lt;br /&gt;
# http://www.magusco.com/kb/index.php/Parsing_WhitePages_Search_Results_HTML&lt;br /&gt;
         lis       = obj.findAll(&amp;#039;li&amp;#039;,  { &amp;quot;class&amp;quot; : None})&lt;br /&gt;
&lt;br /&gt;
         for li in lis:&lt;br /&gt;
            xml += &amp;#039;  &amp;lt;li&amp;gt;%s&amp;lt;/li&amp;gt;\n&amp;#039; % li.find(text=True)&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
         for address in obj.findAll(&amp;#039;li&amp;#039;,  { &amp;quot;class&amp;quot; : &amp;quot;entryData address&amp;quot;}):&lt;br /&gt;
            xml += &amp;#039;  &amp;lt;addr&amp;gt;%s&amp;lt;/addr&amp;gt;\n&amp;#039; % address.find(text=True)&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
         for phone in obj.findAll(&amp;#039;li&amp;#039;,  { &amp;quot;class&amp;quot; : &amp;quot;entryData phoneNumber&amp;quot;}):&lt;br /&gt;
	    xml += &amp;#039;  &amp;lt;phoneNumber&amp;gt;%s&amp;lt;/phoneNumber&amp;gt;\n&amp;#039; % phone.find(text=True)&lt;br /&gt;
&lt;br /&gt;
         xml += &amp;#039;&amp;lt;/entry&amp;gt;\n\n&amp;#039;&lt;br /&gt;
&lt;br /&gt;
   return xml&lt;br /&gt;
&lt;br /&gt;
#-------------------------------------------------------------------------------------&lt;br /&gt;
&lt;br /&gt;
def test():&lt;br /&gt;
   # xml = parse(&amp;#039;html/0002.html&amp;#039;)&lt;br /&gt;
&lt;br /&gt;
   # print xml&lt;br /&gt;
&lt;br /&gt;
#-------------------------------------------------------------------------------------&lt;br /&gt;
&amp;lt;/pre&amp;gt;&lt;br /&gt;
&lt;br /&gt;
=Results of Search=&lt;br /&gt;
&lt;br /&gt;
Are written to a file as XML.&lt;br /&gt;
&lt;br /&gt;
[[Category:Python]]&lt;br /&gt;
[[Category:Python httplib]]&lt;br /&gt;
[[Category:Internet]]&lt;/div&gt;</summary>
		<author><name>PeterHarding</name></author>
	</entry>
</feed>