Difference between revisions of "Python - XML Processing"
Jump to navigation
Jump to search
PeterHarding (talk | contribs) (New page: =Using XML Module (xml.dom.utils) to Parse XML= Here is a bit of code I used recently for pulling apart a chunk of XML (a SOAP request). <pre> #!/usr/bin/env python #--------------------...) |
PeterHarding (talk | contribs) |
||
| Line 289: | Line 289: | ||
#----- Recurse node ------------------------------------------------------- | #----- Recurse node ------------------------------------------------------- | ||
""" | """ | ||
Node: esb:SvcName | Node: esb:SvcName receiveBilling | ||
Node: esb:SvcVersion 1.0 | Node: esb:SvcVersion 1.0 | ||
Node: esb:RequestDt 2007-06-20T11:36:00 | Node: esb:RequestDt 2007-06-20T11:36:00 | ||
| Line 298: | Line 298: | ||
Node: ns0:sourceSystemID AAE PSft | Node: ns0:sourceSystemID AAE PSft | ||
Node: ns0:sequenceID 1 | Node: ns0:sequenceID 1 | ||
Node: ns0:notificationEmail | Node: ns0:notificationEmail xxx@xxx.com.au | ||
Node: ns0:apAccountNumber | Node: ns0:apAccountNumber 5322201 | ||
Node: ns0:cdpAccountNumber | Node: ns0:cdpAccountNumber 2922230 | ||
Node: ns0:cdpCustomerID | Node: ns0:cdpCustomerID 2822275 | ||
Node: ns0:cdpCustomerName | Node: ns0:cdpCustomerName XXXXX | ||
Node: ns0:orderID PH0000000097 | Node: ns0:orderID PH0000000097 | ||
Node: ns0:orderDateTime 2003-12-31T12:00:00.000+10:00 | Node: ns0:orderDateTime 2003-12-31T12:00:00.000+10:00 | ||
Revision as of 14:59, 21 April 2008
Using XML Module (xml.dom.minidom) to Parse XML
Here is a bit of code I used recently for pulling apart a chunk of XML (a SOAP request).
#!/usr/bin/env python
#--------------------------------------------------------------------------
import re
import sys
import getopt
import pprint
import types
import xml.dom.minidom
import ebdm
# from xml.dom.utils import FileReader
from string import join, split
#--------------------------------------------------------------------------
"""
Read in a DOM instance, convert it to a Python object
"""
# Script version string.
__version__ = '1.0.0'

# Command-line switches; main() overwrites these from -d and -v.
debug_flg = False
verbose_flg = False

# Pretty-printer used for node dumps in verbose mode.
pp = pprint.PrettyPrinter(indent=3)
# Matches a newline followed by any number of spaces; the node walkers use it
# to spot layout-only text nodes.
regex = re.compile("\\n *")
# Default input file; main() overrides it from -f / --file.
filename = '0001.xml'

# Objects built up by capture() while the document is walked.
service = None
cdp = None
customer = None
order = None

# Parser state: which kind of record subsequent ns0:* fields belong to.
in_cdp = False
# capture() declares this global and reads it, so it needs a defined starting
# value; without one, the first read before any customer raises NameError.
in_customer = False
in_order = False
in_address = False
no_orders = 0
curr_obj_type = None
# The Obj record currently being filled by capture().
obj = None
# Record types whose header line dump() has already written.
hdr_flg = {}
#==========================================================================
class Obj:
    """Bare attribute container; fields are attached to instances dynamically."""

    def __init__(self):
        pass

    def __str__(self):
        """Return one "name -> value" line for every instance attribute."""
        return ''.join(
            "%s -> %s\n" % (name, getattr(self, name))
            for name in self.__dict__
        )
#==========================================================================
class Service:
    """Attribute holder for the esb:* fields that capture() stores on it."""

    def __init__(self):
        # No fixed fields: attributes are added with setattr() as nodes arrive.
        pass
#==========================================================================
class CDP:
    """Top-level record: its own fields plus the customers found beneath it."""

    def __init__(self):
        # Each instance gets its own list; capture() appends Customer objects.
        self.customers = []
#==========================================================================
# Customer fields, in the column order used by Customer.dump_header/dump_data.
customer_attr = [
    'apAccountNumber',
    'cdpAccountNumber',
    'cdpCustomerID',
    'cdpCustomerName',
]


class Customer:
    """One customer record and the orders collected for it.

    Field values are attached dynamically (see customer_attr for the names
    that are written out); ``orders`` holds the customer's Order objects.
    """

    def __init__(self):
        self.orders = []

    def dump_header(self, fd):
        """Write the "[Customer]," marker and the field names as one line.

        fd -- any object with a write() method.
        """
        fd.write("[Customer],")
        for name in customer_attr:
            fd.write("%s," % name)
        fd.write("\n")

    def dump_data(self, fd):
        """Write this customer's field values as one comma-terminated line.

        A field that was never set is written as None, matching how Order
        reports its unset fields, instead of raising AttributeError.

        fd -- any object with a write() method.
        """
        for name in customer_attr:
            fd.write("%s," % getattr(self, name, None))
        fd.write("\n")
#==========================================================================
# Order fields, in the column order written by Order.dump_header() and
# Order.dump_data().
order_attr = [
'orderID',
'orderStatus',
'orderDateTime',
'readyDateTime',
'invoiceDateTime',
'numberOfItems',
'workCentreID',
'customerCostCentre',
'orderRef1',
'orderRef1Desc',
'orderRef2',
'orderRef2Desc',
'orderRef3',
'orderRef3Desc',
'otherCustomerRef',
'otherCustomerRefDesc',
'address1',
'address2',
'address3',
'suburb',
'state',
'postcode',
'entryWeight',
'cubedWeight',
'actualWeight',
'height',
'length',
'width',
'serviceType',
'serviceArea',
'serviceCodeSet',
'serviceCode',
'serviceDesc',
'apProductCode',
'transactionID',
'transactionType',
'chargeType',
'chargeCode',
'chargeCodeDesc',
'priceCharged',
'taxCode'
]
# Maps an order-level field name to the record type it belongs to.  capture()
# looks the name up here to decide which list in `objs` receives the value;
# fields mapped to 'IGNORE' are stored on the Order only.
attr = {
'orderID' : 'Order',
'orderStatus' : 'IGNORE',
'orderDateTime' : 'IGNORE',
'readyDateTime' : 'IGNORE',
'invoiceDateTime' : 'IGNORE',
'numberOfItems' : 'Order',
'workCentreID' : 'Order',
'customerCostCentre' : 'Order',
'orderRef1' : 'CustRef',
'orderRef1Desc' : 'CustRef',
'orderRef2' : 'CustRef',
'orderRef2Desc' : 'CustRef',
'orderRef3' : 'CustRef',
'orderRef3Desc' : 'CustRef',
'otherCustomerRef' : 'CustRef',
'otherCustomerRefDesc' : 'CustRef',
'address1' : 'Address',
'address2' : 'Address',
'address3' : 'Address',
'suburb' : 'Address',
'postcode' : 'Address',
'state' : 'Address',
'entryWeight' : 'Weight',
'cubedWeight' : 'Weight',
'actualWeight' : 'Weight',
'height' : 'Dimension',
'length' : 'Dimension',
'width' : 'Dimension',
'schedPickUpDateTime' : 'IGNORE',
'pickUpTimeliness' : 'IGNORE',
'schedDeliveryDateTime' : 'IGNORE',
'deliveryTimeliness' : 'IGNORE',
'pickupSignatureRequired' : 'IGNORE',
'pickupSignatureReceived' : 'IGNORE',
'actualPickUpDateTime' : 'IGNORE',
'latePickUpIndicator' : 'IGNORE',
'deliverySignatureRequired' : 'IGNORE',
'deliverySignatureReceived' : 'IGNORE',
'actualDeliveryDateTime' : 'IGNORE',
'lateDeliveryIndicator' : 'IGNORE',
'serviceType' : 'Service',
'serviceArea' : 'Service',
'serviceCodeSet' : 'Service',
'serviceCode' : 'Service',
'serviceDesc' : 'Service',
'apProductCode' : 'Service',
'transactionID' : 'Charge',
'transactionType' : 'Charge',
'chargeType' : 'Charge',
'chargeCode' : 'Charge',
'chargeCodeDesc' : 'Charge',
'priceCharged' : 'Charge',
'taxCode' : 'Charge'
}
# Obj records collected by capture(), keyed by record type; dump() writes one
# .dat file per key.
objs = {
'Order' : [],
'CustRef' : [],
'Address' : [],
'Weight' : [],
'Dimension' : [],
'Service' : [],
'Charge' : []
}
class Order:
    """One order; every known field defaults to None until a value is set."""

    # Order header
    orderID = orderStatus = orderDateTime = readyDateTime = None
    invoiceDateTime = numberOfItems = workCentreID = customerCostCentre = None
    # Customer references
    otherCustomerRef = otherCustomerRefDesc = None
    orderRef1 = orderRef1Desc = orderRef2 = orderRef2Desc = None
    orderRef3 = orderRef3Desc = None
    # Address
    address1 = address2 = address3 = suburb = postcode = state = None
    # Weights and dimensions
    entryWeight = cubedWeight = actualWeight = None
    height = length = width = None
    # Service
    serviceType = serviceArea = serviceCodeSet = None
    serviceCode = serviceDesc = apProductCode = None
    # Charge
    transactionID = transactionType = chargeType = chargeCode = None
    chargeCodeDesc = priceCharged = taxCode = None

    def __init__(self):
        pass

    def dump_header(self, fd):
        """Write the order field names (order_attr) as one comma-terminated line."""
        for column in order_attr:
            fd.write("%s," % column)
        fd.write("\n")

    def dump_data(self, fd):
        """Write this order's field values, in order_attr order, as one line."""
        for column in order_attr:
            fd.write("%s," % getattr(self, column))
        fd.write("\n")
#----- Capture node -------------------------------------------------------
"""
Node: esb:SvcName receiveBilling
Node: esb:SvcVersion 1.0
Node: esb:RequestDt 2007-06-20T11:36:00
Node: esb:ComponentID Test
Node: esb:ComponentName Test Application
Node: ns0:cdpID AaE
Node: ns0:consortiumID APCBE
Node: ns0:sourceSystemID AAE PSft
Node: ns0:sequenceID 1
Node: ns0:notificationEmail xxx@xxx.com.au
Node: ns0:apAccountNumber 5322201
Node: ns0:cdpAccountNumber 2922230
Node: ns0:cdpCustomerID 2822275
Node: ns0:cdpCustomerName XXXXX
Node: ns0:orderID PH0000000097
Node: ns0:orderDateTime 2003-12-31T12:00:00.000+10:00
"""
def capture(node_name, value):
    """Store one leaf element's text on the record it belongs to.

    node_name -- qualified element name, e.g. 'esb:SvcName' or 'ns0:orderID'.
    value     -- the element's text.

    'esb:' names are stored on the module-level Service object.  'ns0:' names
    are routed by the parser state flags:

    * 'apAccountNumber' starts a new Customer (appended to cdp.customers);
    * 'orderID' starts a new Order (appended to the current customer) and a
      new 'Order' record in objs;
    * 'cdpID' switches to CDP mode;
    * any other name is stored on whichever of cdp / customer / order is
      current.

    While inside an order, each field is also stored on an Obj record grouped
    by the record type from the `attr` table; fields mapped to 'IGNORE' are
    kept on the Order only.  Names with any other prefix are ignored.

    Raises KeyError for an order-level field that is missing from `attr`.
    """
    global service, cdp, customer, order
    global in_cdp, in_customer, in_order, no_orders
    global obj

    # Create the containers lazily, on the first captured node.
    if not service:
        service = Service()
    if not cdp:
        cdp = CDP()

    if re.match('esb:', node_name):
        name = re.sub('esb:', '', node_name)
        setattr(service, name, value)
        print("service.__dict__ -> '%s'" % service.__dict__)
        return

    if not re.match('ns0:', node_name):
        return

    name = re.sub('ns0:', '', node_name)
    print(">>> name : %s" % name)

    if re.match('apAccountNumber', name):
        in_cdp = False
        in_customer = True
        in_order = False
        customer = Customer()
        print("New customer!")
        cdp.customers.append(customer)
        setattr(customer, name, value)
        return

    if re.match('orderID', name):
        in_cdp = False
        in_customer = False
        in_order = True
        no_orders += 1
        order = Order()
        obj = Obj()
        obj.type = 'Order'
        objs['Order'].append(obj)
        setattr(obj, name, value)
        customer.orders.append(order)
        setattr(order, name, value)
        return

    if re.match('cdpID', name):
        in_cdp = True
        setattr(cdp, name, value)
        return

    if in_cdp:
        setattr(cdp, name, value)
        print("cdp.__dict__ -> '%s'" % cdp.__dict__)
        return

    if in_customer:
        setattr(customer, name, value)
        print(customer.__dict__)
        return

    if in_order:
        setattr(order, name, value)
        obj_type = attr[name]
        if obj_type == 'IGNORE':
            return
        if obj_type != obj.type:
            # The field belongs to a different record type: start a new record.
            obj = Obj()
            obj.type = obj_type
            objs[obj_type].append(obj)
        # Store the field on the current record.  Previously the field that
        # triggered a new record was dropped, so every group lost its first
        # attribute.
        setattr(obj, name, value)
        return
#----- Recurse node -------------------------------------------------------
def recurse_node(node):
    """Walk a DOM subtree and pass each leaf element's text to capture().

    node -- a DOM node.  Its nodeName, childNodes and nodeValue are read only
            when they are present in the instance __dict__, so attributes
            that exist only on the class are treated as absent.

    A node with no children, or with no instance-level nodeName, ends the
    walk for that branch.  An element whose only child is a '#text' node is
    reported as capture(element name, text).  Every child accepted by
    display_node() is then visited recursively.

    Returns None.
    """
    node_name = node.nodeName if 'nodeName' in node.__dict__ else False
    nodes = node.childNodes if 'childNodes' in node.__dict__ else None

    if not nodes:
        # Leaf node: nothing to capture, only optional diagnostics.
        if 'nodeValue' in node.__dict__:
            if regex.search(node.nodeValue):
                if verbose_flg:
                    print(">>>>>>> Bogus text node!")
        return

    if not node_name:
        return

    if len(nodes) == 1:
        if nodes[0].nodeName == '#text':
            value = nodes[0].nodeValue
            if verbose_flg:
                print("===== Node: %-30s %s" % (node_name, value))
            capture(node_name, value)
    elif verbose_flg:
        print("@@@@@ Node: %-30s" % node_name)

    for child in nodes:
        if display_node(child):
            recurse_node(child)
#----- Display node -------------------------------------------------------
def display_node(node):
    """Tell the walker whether a node should be visited.

    Returns False for a '#text' node whose instance-level nodeValue matches
    the module-level `regex` (a newline followed by optional spaces).  Every
    other node returns True.  In verbose mode the accepted node's name and
    __dict__ are printed first.
    """
    if node.nodeName == '#text' and 'nodeValue' in node.__dict__:
        if regex.search(node.nodeValue):
            return False

    if verbose_flg:
        print("===== node =====================================================")
        print("Node Name: [%s]" % node.nodeName)
        print("----------------------------------------------------------------")
        pp.pprint(node.__dict__)
        print("\n")
    return True
#----- Clean --------------------------------------------------------------
def clean(filename, out_filename='parse.xml'):
    """Copy a file, dropping every 0xA0 and 0xC2 byte.

    filename     -- path of the file to read.
    out_filename -- path to write; defaults to 'parse.xml' in the current
                    directory, which is the file parse() reads back.

    The two bytes together are the UTF-8 encoding of U+00A0 (no-break space).
    Each byte is removed wherever it occurs, not only as that pair.
    NOTE(review): this also damages any other UTF-8 sequence containing 0xC2
    or 0xA0 -- confirm the inputs are otherwise plain ASCII.

    The file is handled as bytes so the filter behaves the same on Python 2
    and Python 3, and both handles are closed even if reading or writing
    fails.
    """
    with open(filename, 'rb') as ifd:
        data = ifd.read()
    data = data.replace(b'\xa0', b'').replace(b'\xc2', b'')
    with open(out_filename, 'wb') as out:
        out.write(data)
#----- Usage --------------------------------------------------------------
def usage():
    """Write the one-line command synopsis to stderr.

    NOTE: a second usage() is defined later in this file; that later
    definition is the one in effect once the module has loaded.
    """
    sys.stderr.write("$ ./parse_order.py [-f <file>]\n")
#----- Parse --------------------------------------------------------------
def parse(filename):
    """Parse an XML file and write its customers and orders as CSV.

    filename -- path of the input XML file.

    The input is first filtered by clean() into 'parse.xml', which is then
    parsed with xml.dom.minidom and walked with recurse_node().  Output goes
    to 'orders.csv', or to standard output when debug_flg is set: one
    customer header line, then for each customer its data line, an order
    header line and one line per order.
    """
    clean(filename)  # -> parse.xml
    dom_obj = xml.dom.minidom.parse('parse.xml')
    recurse_node(dom_obj.childNodes[0])
    print('Processed %d orders' % no_orders)

    if debug_flg:
        ofd = sys.stdout
    else:
        ofd = open('orders.csv', 'w')
    try:
        Customer().dump_header(ofd)
        # cdp stays None when the document produced no captured nodes.
        customers = cdp.customers if cdp is not None else []
        for customer in customers:
            print("customer.__dict__ -> '%s'" % customer.__dict__)
            customer.dump_data(ofd)
            Order().dump_header(ofd)
            for order in customer.orders:
                order.dump_data(ofd)
    finally:
        # Close only a file opened here.  Closing sys.stdout would make
        # every later print (dump() runs right after this) fail.
        if ofd is not sys.stdout:
            ofd.close()
#---------------------------------------------------------------------
def dump():
    """Write every captured record type to 'dat/<cdpID>/<type>.dat'.

    For each key of `objs` a file is opened for writing and one quoted,
    comma-terminated line is written per record.  The first time a record of
    a given type is written (tracked in `hdr_flg`), a header line with that
    record's field names is written before it.  The bookkeeping attribute
    'type' is left out of both header and data lines, so the columns line up.

    Each type name and each record is also printed to standard output.

    The 'dat/<cdpID>' directory must already exist; open() fails otherwise.
    """
    for obj_type in objs.keys():
        print(obj_type)
        with open('dat/%s/%s.dat' % (cdp.cdpID, obj_type), 'w') as ofp:
            for record in objs[obj_type]:
                # Select columns by attribute name.  Filtering by value, as
                # before, dropped any field whose value happened to equal
                # the type name.
                fields = [key for key in record.__dict__.keys()
                          if key != 'type']
                if obj_type not in hdr_flg:
                    hdr_flg[obj_type] = 1
                    ofp.write(''.join('%s,' % key for key in fields) + '\n')
                ofp.write(
                    ''.join('"%s",' % getattr(record, key) for key in fields)
                    + '\n'
                )
                print(record)
#--------------------------------------------------------------------------
def usage():
    """Write the usage banner to stderr."""
    banner_lines = ["", "Usage:", "$ dt.py", ""]
    sys.stderr.write("\n".join(banner_lines))
#---------------------------------------------------------------------
def main(argv):
    """Parse the command line, then run parse() followed by dump().

    argv -- the argument list without the program name.

    Options: -d/--debug, -h/--help, -f/--file <name>, -s/--stdout,
    -v/--verbose.  An unrecognised option prints the usage text and exits
    with status 2; -h prints it and exits with status 0.
    """
    global debug_flg, verbose_flg, filename

    short_opts = "dhf:sv"
    long_opts = ["debug", "help", "file=", "stdout", "verbose"]
    try:
        opts, args = getopt.getopt(argv, short_opts, long_opts)
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    for flag, param in opts:
        if flag in ("-d", "--debug"):
            debug_flg = True
            continue
        if flag in ("-h", "--help"):
            usage()
            sys.exit(0)
        if flag in ("-f", "--file"):
            filename = param
            continue
        if flag in ("-s", "--stdout"):
            # Accepted but has no effect: nothing reads this setting.
            continue
        if flag in ("-v", "--verbose"):
            verbose_flg = True

    parse(filename)
    dump()
#---------------------------------------------------------------------
if __name__ == "__main__":
    # Run only when executed as a script; drop the program name from argv.
    cli_args = sys.argv[1:]
    main(cli_args)
#---------------------------------------------------------------------