Alexandro,
In case this can help you, here is an example of a SAX-parsing
Python script
using 'zipfile' and 'xml.sax'.
It works fine with the OpenOffice and Word XML/Word docx
formats.
I do not know what the benefits of using the PyUNO
module would be, but I
would be pleased to learn more about this.
Laurent
#!/usr/bin/python
# -*- coding: latin-1 -*-
#-----------------------------------------------------------------------------
# (#) <TRAMway> converter for WordML (XML and docx) and OpenOffice.org files
#-----------------------------------------------------------------------------
"""
Three usage modes are available:
1/ Command line: use options and arguments following the getopt standard
2/ CGI: install the script on a Web server
3/ Python API: call the API from any other code
API:
xml2tr.convert(option_tag,argument_tab)
an option element is a pair (option name, option value)
Use the -h option to get the option list
"""
# Version-control ``$Id$`` keyword string; it is embedded in the generated
# document title and printed by the -v/--version option.
__version__ = '$Id: xml2tr.py 40 2007-01-08 16:26:04Z saintax $'
import cgi # web
import getopt # command line
import md5
import os
import re
import string
import sys
import zipfile
from xml.sax import make_parser, handler, saxutils, parseString
def usage():
    """Write the command-line help to standard error, then exit.

    The script name shown in the first line is taken from ``sys.argv[0]``.
    The function ends with ``sys.exit()`` and therefore never returns to
    its caller.
    """
    sys.stderr.write("""USAGE: %s [options] input_file...\n
options:
-h
--help
\tThis text.\n
-v
--version
\tDisplay Tool Version.\n
-i string
--identification string
\tDefines the regular expression matching the name of the paragraph style
to capture Requirement Identification.
\tDefault string is 'reqidentification'\n
-d string
--description string
\tDefines the regular expression matching the name of the paragraph style
to capture Requirement Description.
\tDefault string is 'reqdescription'\n
-r string
--reference string
\tDefines the regular expression matching the name of the paragraph style
to capture Requirement Reference.
\tDefault string is 'reqreference'\n
-a string
--attribute string
\tDefines the regular expression matching the name of the paragraph style
to capture Requirement Attribute.
\tDefault string is 'reqattribute'\n
-p string
--prefix string
\tAdd a prefix string to all detected requirement ids.
\tDefault string is empty\n
-o string
--output string
\tSpecifies an output file name.
\tDefault is the standard output.\n
Note:
\nregular expression is not case sensitive and style name can have prefix
or suffix
input_file:
\tWord 2003 XML or Opendocument file format (WordML)\n""" % (sys.argv[0]))
    sys.exit()
#__________#
class param:
    """Minimal holder exposing a single mutable ``value`` attribute.

    It gives command-line options the same ``obj.value`` access that the
    rest of the script uses on option entries.
    """

    def __init__(self, val):
        # Stored as-is; callers read and overwrite ``value`` directly.
        self.value = val
#____________________#
class cmdFieldStorage:
    """Dictionary-backed option store for command-line use.

    It offers the ``has_key`` / ``[]`` / ``getlist`` access style that the
    script also uses on ``cgi.FieldStorage``, so the converter can treat
    both sources of options the same way.
    """

    def __init__(self):
        # key -> ``param`` instance (via ``add``) or raw list (via ``addlist``)
        self.dico = {}

    def has_key(self, key):
        """Return True when *key* has been stored."""
        return key in self.dico

    def __contains__(self, key):
        """Support the ``key in storage`` idiom (same answer as has_key)."""
        return key in self.dico

    def add(self, key, val):
        """Store *val* under *key*, wrapped in a ``param`` holder."""
        self.dico[key] = param(val)

    def addlist(self, key, l):
        """Store the list *l* under *key* without wrapping it."""
        self.dico[key] = l

    def __getitem__(self, key):
        """Return the entry stored under *key*; raises KeyError if absent."""
        return self.dico[key]

    def getlist(self, key):
        """Return the entry stored under *key*, or an empty list if absent."""
        return self.dico.get(key, [])
#__________________#
def header_appli(sum):
    """Return the XML prolog and the opening part of the TRAMway document.

    Args:
        sum: text stored in the ``md5`` attribute of the root element
            (``convert`` passes the hexadecimal MD5 digest of its input).

    Returns:
        A string holding the XML declaration, the processing instructions,
        the opening ``<tramway>`` tag and the ``<title>`` element, which
        carries the script base name and ``__version__``.
    """
    # The ``progid`` processing instruction must stay on the second line:
    # the command-line entry point inspects line 2 of an input file to
    # recognise documents that are already in TRAMway format.
    return """<?xml version="1.0" encoding="iso-8859-1" standalone="yes"?>
<?topcased-application progid="TRAMway"?>
<?xml-stylesheet href="neutral.css" type="text/css"?>
<tramway xmlns="http://www.w3.org/1999/xhtml" md5="%s">
<title>TRAMway converter [%s] %s</title>\n""" % (
        sum, os.path.basename(sys.argv[0]), __version__)
#__________________#
def footer_appli():
    """Return the closing ``</tramway>`` tag followed by a newline."""
    return '</tramway>\n'
#_______________________________________#
class docHandler(handler.ContentHandler):
    """SAX handler turning WordML / OpenDocument paragraphs into TRAMway XML.

    Each paragraph (``w:p`` or ``text:p``) is classified by matching its
    style name against the regular expressions held in the option mapping:

    * ``'i'`` -- requirement identification: starts a new requirement;
    * ``'d'`` -- requirement description: appended to the current one;
    * ``'r'`` -- requirement reference: added to the "cover" references;
    * ``'a'`` -- requirement attribute, written as ``name : value``.

    Paragraphs whose style has a known outline level (``w:outlineLvl``)
    open a ``<section>``.  The generated markup is appended to ``fdo[0]``.
    """

    # "<left> : <right>" paragraphs (requirement ids and attributes).
    _KEY_VALUE = re.compile(r'^\s*(.+)\s*:\s*(.+)\s*$')

    def __init__(self, dico, fdo):
        """Prepare an empty conversion state.

        Args:
            dico: mapping whose ``'i'``, ``'d'``, ``'r'`` and ``'a'``
                entries expose a ``value`` attribute holding a regular
                expression fragment, and whose ``'p'`` entry holds the
                prefix added to every requirement id.
            fdo: one-element list; the output is accumulated in ``fdo[0]``.
        """
        handler.ContentHandler.__init__(self)
        self.intext = False      # True while character data is captured
        self.buffer = ''         # raw (unescaped) text of the paragraph
        self.style = ''          # style name of the current paragraph
        self.id = ''             # id of the requirement being built
        self.sd = ''             # ready-made ' shortdescription=...' text
        self.seen_id = {}        # ids already met, to detect duplicates
        self.reflist = ''        # '|'-separated references
        self.desc = ''           # XML-escaped description text
        self.fo = dico
        # Characters / words that make an id invalid when found after '#'.
        # NOTE(review): rebuilt from a line-wrapped paste; the character
        # class is assumed to contain a space after ':' -- confirm against
        # the original script.
        self.keywords = (r'([: #\/^$,;?!& )({}=+|]'
                         r'|delete|derived|double|known|bad_id'
                         r'|covered|refined|resolved)')
        self.section = ''        # styleId of the w:style being read
        self.level = {}          # styleId -> outline level (1-based)
        self.attribute = {}      # attributes of the current requirement
        self.o = fdo
        self.tl = {}             # levels of the currently open sections
        self.auto = {}           # ODF automatic style -> parent style

    @staticmethod
    def _latin1(text, errors='strict'):
        """Return *text* as a latin-1 encodable native string.

        On Python 2 this is the latin-1 byte string; on Python 3 the text
        is decoded back so that the handler keeps working with ``str``.
        """
        encoded = text.encode('latin-1', errors)
        if isinstance(encoded, str):
            return encoded
        return encoded.decode('latin-1')

    def _attr(self, attrs, key):
        """Return attribute *key* as a native string, '' when it is absent."""
        raw = attrs.get(key)
        if raw is None:
            return ''
        return self._latin1(raw)

    def _style_is(self, key):
        """Tell whether the current style matches the option regex *key*."""
        return re.match(r'.*%s' % (self.fo[key].value), self.style,
                        re.IGNORECASE)

    def startDocument(self):
        """Nothing to prepare when the document starts."""
        pass

    def endDocument(self):
        """Flush the pending requirement and close every open section."""
        self.record()
        self.newsection(1, True)

    def characters(self, content):
        """Accumulate paragraph text while inside a text-bearing element.

        The text is stored unescaped; XML escaping is applied once, when
        the text is written to the output.
        """
        if self.intext:
            self.buffer += self._latin1(content, 'replace')

    def startElement(self, name, attrs):
        """Track styles, outline levels and the start of paragraphs."""
        if name == 'style:style':
            # ODF automatic styles (P1, P2, ...) stand for their parent.
            auto_name = self._attr(attrs, 'style:name')
            if re.match(r'P\d+', auto_name):
                self.auto[auto_name] = self._attr(
                    attrs, 'style:parent-style-name')
        if name == 'text:p':
            self.intext = True
            self.buffer = ''
            self.style = self._attr(attrs, 'text:style-name')
            self.style = self.auto.get(self.style, self.style)
        if name == 'w:t':
            self.intext = True
        elif name == 'w:p':
            self.style = 'none'
            self.buffer = ''
        elif name == 'w:pStyle':
            self.style = self._attr(attrs, 'w:val')
        elif name == 'w:style':
            self.section = self._attr(attrs, 'w:styleId')
        elif name == 'w:outlineLvl':
            try:
                self.level[self.section] = int(self._attr(attrs, 'w:val')) + 1
            except ValueError:
                # Missing or non-numeric level: the style opens no section.
                pass

    def endElement(self, name):
        """Classify the paragraph that just ended and update the state."""
        # Capture stops with the text run (WordML) or the paragraph (ODF);
        # the end of a nested inline element must not stop it.
        if name in ('w:t', 'text:p'):
            self.intext = False
        if name not in ('w:p', 'text:p'):
            return
        # Strip once per paragraph so that spaces between runs survive.
        self.buffer = self.buffer.strip()
        if not self.buffer:
            return
        if self._style_is('i'):
            self._start_requirement()
        elif self._style_is('d'):
            self.desc += '\n' + saxutils.escape(self.buffer)
        elif self._style_is('r'):
            if self.reflist:
                self.reflist += '|' + self.buffer
            else:
                self.reflist = self.buffer
        elif self._style_is('a'):
            mat = self._KEY_VALUE.match(self.buffer)
            if mat:
                self.attribute[mat.group(1)] = mat.group(2)
        elif self.style in self.level:
            self.newsection(self.level[self.style])

    def _start_requirement(self):
        """Emit the previous requirement and start a new one from buffer."""
        self.record()  # record last requirement
        self.desc = ''
        mat = self._KEY_VALUE.match(self.buffer)
        self.id = self.fo['p'].value
        if mat:
            # "id : short description"
            self.id += mat.group(1)
            self.sd = ' shortdescription=%s' % (
                saxutils.quoteattr(mat.group(2)))
        else:
            self.id += self.buffer
            self.sd = ''
        if self.id in self.seen_id:
            # NOTE(review): message text rebuilt from a garbled paste
            # ("ERROR OUBLE"); confirm the exact wording.
            sys.stderr.write('ERROR:DOUBLE:%s\n' % (self.id))
            self.id += '_DEFINED_SEVERAL_TIMES'
        else:
            self.seen_id[self.id] = True

    def newsection(self, lvl, end=False):
        """Close the sections of level >= *lvl* and open a new one.

        Args:
            lvl: outline level of the section to open.
            end: when True only close sections; nothing is opened.
        """
        # Iterate over a copy: entries are deleted inside the loop.
        for open_level in list(self.tl.keys()):
            if open_level >= lvl:
                self.o[0] += '</section>\n'
                del self.tl[open_level]
        self.tl[lvl] = True
        if not end:
            self.o[0] += '<section shortdescription=%s>\n' % (
                saxutils.quoteattr(self.buffer))

    def record(self):
        """Write the pending requirement, if valid, then reset the state."""
        if re.match(r'.*#.*' + self.keywords + '.*', self.id, re.IGNORECASE):
            sys.stderr.write('ERROR:BAD_ID:%s' % (self.id))
        elif self.id and re.match(r'^[\s\w\d_-]*$', self.id, re.IGNORECASE):
            self.o[0] += '\n<requirement id=%s%s>\n' % (
                saxutils.quoteattr(self.id), self.sd)
            self.o[0] += self.desc + '\n'
            # Sorted so that the output does not depend on dict ordering.
            for attr_name in sorted(self.attribute):
                self.o[0] += '<attribute name=%s value=%s/>\n' % (
                    saxutils.quoteattr(attr_name),
                    saxutils.quoteattr(self.attribute[attr_name]))
            if self.reflist:
                for ref in self.reflist.split('|'):
                    self.o[0] += '<reference type="cover" id=%s/>\n' % (
                        saxutils.quoteattr(ref))
            self.o[0] += '</requirement>\n'
        self.id = ''
        self.desc = ''
        self.reflist = ''
        self.attribute = {}
#__________________#
def get_cgi_param():
    """Return the HTML form used in CGI mode to upload a file and set options.

    The form posts the input file as ``arg`` and the style-name patterns as
    ``i``, ``d``, ``r`` and ``a``, each pre-filled with a default value.
    """
    return """<form enctype="multipart/form-data" method="post">
<p> Input file <input type="file" name="arg"/></p>
<p> Option -i <input name="i" value="reqidentification"/></p>
<p> Option -d <input name="d" value="reqdescription"/></p>
<p> Option -r <input name="r" value="reqreference"/></p>
<p> Option -a <input name="a" value="reqattribute"/></p>
<input type="submit" value="send"/>
</form>"""
#____________________#
def convert(opts,str):
    """Convert WordML / OpenDocument XML content into a TRAMway document.

    Args:
        opts: list of ``(option, value)`` pairs in getopt form, e.g.
            ``[('-i', 'requirementId'), ('-p', 'SYS_')]``.
        str: content of the XML document to convert (the parameter name is
            kept for API compatibility although it shadows the builtin).

    Returns:
        The TRAMway document: header (carrying the MD5 digest of the
        input), the converted requirements and sections, and the footer.

    Side effects:
        ``sys.stderr`` is rebound to ``sys.stdout``.  With ``-o``/
        ``--output`` the named file is opened and ``sys.stdout`` is rebound
        to it, so that whatever the caller prints afterwards lands in that
        file.  ``-h`` and ``-v`` terminate the process through
        ``sys.exit()``.
    """
    try:
        from hashlib import md5 as new_md5
    except ImportError:  # Python < 2.5 has no hashlib
        from md5 import new as new_md5

    sys.stderr = sys.stdout
    acgi = False
    # CGI mode is deliberately switched off by the leading ``False``.
    if False and ('HTTP_USER_AGENT' in os.environ) and (opts==[]):
        acgi = True
        fo = cgi.FieldStorage()
        cgi_header = 'Content-type: application/xhtml+xml'
    else:
        fo = cmdFieldStorage()
        fo.add('i','(reqidentification|id)')
        fo.add('d','(reqdescription|requirement)')
        fo.add('r','reqreference')
        fo.add('a','reqattribute')
        fo.add('p','')
    fd = sys.stdout
    for o, a in opts:
        if o in ('-h', '--help'):
            usage()
        elif o in ('-v', '--version'):
            sys.stdout.write('%s v %s\n'%(sys.argv[0],__version__))
            sys.exit()
        elif o in ('-i', '--identification'):
            fo['i'].value = a
        elif o in ('-d', '--description'):
            fo['d'].value = a
        elif o in ('-r', '--reference'):
            fo['r'].value = a
        elif o in ('-a', '--attribute'):
            fo['a'].value = a
        elif o in ('-p', '--prefix'):
            fo['p'].value = a
        elif o in ('-o', '--output'):
            fd = open (a,'w')
            sys.stdout = fd
    digest = new_md5()
    digest.update(str)
    out = ''
    if acgi:
        out += cgi_header+'\n\n'
    out += header_appli(digest.hexdigest())
    if acgi and (not fo.has_key('i')):
        out += get_cgi_param()
    # One-element list: lets the SAX handler append to a shared string.
    testout = ['']
    if str:
        dh = docHandler(fo,testout)
        parseString(str,dh)
    out += testout[0]
    out += footer_appli()
    # ``fd`` is either the initial sys.stdout or the file sys.stdout was
    # rebound to, so this test never succeeds; the output file must in any
    # case stay open for the caller to print the returned text into it.
    if fd != sys.stdout:
        fd.close()
    return out
#_________________________#
def convert_zip(opts,f,content):
    """Convert one XML member of a zip container and print the result.

    Args:
        opts: list of ``(option, value)`` pairs forwarded to ``convert``.
        f: path (or file object) of the zip archive (.odt, .docx).
        content: name of the archive member holding the document XML.
    """
    archive = zipfile.ZipFile(f)
    try:
        result = convert(opts,archive.read(content))
    finally:
        # Release the archive even when reading or converting fails.
        archive.close()
    # sys.stdout is looked up only now: ``convert`` may have rebound it to
    # the file given with -o.
    sys.stdout.write(result + '\n')
#________________________#
# Command-line entry point: parse the options, detect the input format
# from the file name or from its second line, and print the conversion.
if __name__ == '__main__':
    try:
        # '-v'/'--version' are accepted here because ``convert`` handles
        # them; '--prefix' takes a value, exactly like its short form '-p'.
        opts, args = getopt.getopt(
            sys.argv[1:], 'hvi:d:r:a:p:o:',
            ['help', 'version', 'identification=', 'description=',
             'reference=', 'attribute=', 'prefix=', 'output='])
    except getopt.GetoptError:
        usage()
    if (len(args) == 1) and os.path.isfile(args[0]):
        path = args[0]
        if path.endswith('.odt'):
            convert_zip(opts,path,'content.xml')
        elif path.endswith('.docx'):
            convert_zip(opts,path,'word/document.xml')
        else:
            source = open(path)
            try:
                content = source.read()
            finally:
                source.close()
            # The application id is expected in a processing instruction
            # on the second line, e.g. <?mso-application progid="..."?>.
            typ = ''
            lines = content.split('\n')
            if len(lines) > 1:
                mat = re.match(r'.*progid=(.*)\?>.*$',lines[1])
                if mat:
                    typ = mat.group(1)[1:-1]  # drop the surrounding quotes
            if typ == 'Word.Document':
                sys.stderr.write('word')
                # Convert first: ``convert`` may rebind sys.stdout (-o).
                result = convert(opts,content)
                sys.stdout.write(result + '\n')
            elif typ == 'TRAMway':
                # Already in the target format: pass it through unchanged.
                sys.stderr.write('TRAMway')
                sys.stdout.write(content + '\n')
            else:
                sys.stderr.write('ERROR Not supported format')
    else:
        # Exactly one existing input file is required.
        usage()
# Test
# example of API call without using getopt
#print xml2tr.convert([('-i','requirementId'),('-d','requirementText')],[sys.argv[1]])
Laurent
Alexandro Colorado <jza openoffice.org>
29/03/2007 20:11
Veuillez répondre à
dev xml.openoffice.org
A
dev xml.openoffice.org
cc
Objet
[xml-dev] PyUNO Sax Module
So I am working on a script that parses ODF's XML. I have no
problem
running it on Python 2.4 (my system Python). However, I can't
seem to
get it to work on PyUNO (OOo's Python).
The script uses saxutils from the sax module. I made a diff
of both
modules and couldn't find much difference. I wonder if anyone
here has
used Python's sax module and whether they are able to run the
'DefaultHandler' class from the saxutils library.
I have googled and found other people with a similar problem. I
found that
ContentHandler might be an alternative. I am also reading
the
documentation; however, I can't seem to find anything related
to the
DefaultHandler. Maybe I am looking in the wrong place. Anyway,
here are
the links to those modules:
PyUNO: http://www.python.org/doc/2.3.4/lib/module-xml.sax.html
Python2.4: http://www.python.org/doc/2.4.3/lib/module-xml.sax.html
------------------------------------------------------------
---------
To unsubscribe, e-mail: dev-unsubscribe xml.openoffice.org
For additional commands, e-mail: dev-help xml.openoffice.org
|