877 lines
No EOL
30 KiB
Python
877 lines
No EOL
30 KiB
Python
"""
|
|
Format and compress XML documents
|
|
"""
|
|
import getopt
|
|
import re
|
|
import sys
|
|
import xml.parsers.expat
|
|
|
|
# Version string of this module.
__version__ = "0.2.4"

# Default values shared by the Formatter constructor and the command line tool.
# Preserve blank lines (multiple blank lines are collapsed into one):
DEFAULT_BLANKS = False
# Minify the XML document:
DEFAULT_COMPRESS = False
# Use self-closing tags for elements without content:
DEFAULT_SELFCLOSE = False
# Correct whitespace in text nodes:
DEFAULT_CORRECT = True
# Number of indent characters inserted per nesting level:
DEFAULT_INDENT = 2
# Character used for indenting:
DEFAULT_INDENT_CHAR = " "
# Format inline objects:
DEFAULT_INLINE = True
# Encoding used to decode the input (None: no explicit input encoding):
DEFAULT_ENCODING_INPUT = None
# Encoding used to encode the output (None: no explicit output encoding):
DEFAULT_ENCODING_OUTPUT = None
# Always add a newline character at EOF:
DEFAULT_EOF_NEWLINE = False
|
|
|
|
|
|
class Formatter:
|
|
# Use internal encoding:
|
|
encoding_internal = None
|
|
|
|
def __init__(
    self,
    indent=DEFAULT_INDENT,
    preserve=None,
    blanks=DEFAULT_BLANKS,
    compress=DEFAULT_COMPRESS,
    selfclose=DEFAULT_SELFCLOSE,
    indent_char=DEFAULT_INDENT_CHAR,
    encoding_input=DEFAULT_ENCODING_INPUT,
    encoding_output=DEFAULT_ENCODING_OUTPUT,
    inline=DEFAULT_INLINE,
    correct=DEFAULT_CORRECT,
    eof_newline=DEFAULT_EOF_NEWLINE,
):
    """Store the formatting options.

    indent: indent width per level (converted with int()).
    preserve: names of elements whose content (including descendants)
        must not be reformatted; None means "no such elements".
    blanks: keep blank lines, collapsing multiple into one.
    compress: minify the XML document.
    selfclose: use self-closing tags.
    indent_char: character repeated to build the indent.
    encoding_input: encoding used to decode the document.
    encoding_output: encoding used to encode the result.
    inline: format inline objects.
    correct: correct text nodes.
    eof_newline: always add a newline character at EOF.
    """
    # Minify the XML document:
    self.compress = compress
    # Use self-closing tags:
    self.selfclose = selfclose
    # Correct text nodes:
    self.correct = correct
    # Decode the XML document:
    self.encoding_input = self.enc_normalize(encoding_input)
    # Encode output by:
    self.encoding_output = self.enc_normalize(encoding_output)
    # Insert indent = indent*level*indent_char:
    self.indent = int(indent)
    # Indent by char:
    self.indent_char = indent_char
    # Format inline objects:
    self.inline = inline
    # Don't compress these elements and their descendants. A fresh list is
    # created per instance so that instances never share one default list:
    self.preserve = [] if preserve is None else preserve
    # Preserve blank lines (collapse multiple into one):
    self.blanks = blanks
    # Always add a newline character at EOF:
    self.eof_newline = eof_newline
|
|
|
|
@property
def encoding_effective(self, enc=None):
    """Return the encoding used for the output.

    Precedence: explicit output encoding, then the internal encoding,
    then the input encoding, and finally "UTF-8" as fallback.
    """
    return (
        self.encoding_output
        or self.encoding_internal
        or self.encoding_input
        or "UTF-8"
    )
|
|
|
|
def enc_normalize(self, string):
    """Return the encoding identifier in upper case, or None for non-strings."""
    return string.upper() if isinstance(string, str) else None
|
|
|
|
def enc_encode(self, strg):
    """Encode the formatted XML document with the effective encoding."""
    target = self.encoding_effective
    if sys.version_info <= (3, 0):
        # v2: the document is a UTF-8 byte string that is decoded first.
        return strg.decode("utf-8").encode(target)
    # v3: the document is already a text string.
    return strg.encode(target)
|
|
|
|
def enc_output(self, path, strg):
    """Write the encoded document to a file or to standard output.

    path: target file name; None writes to sys.stdout.
    strg: encoded document (bytes); None writes nothing at all.
    """
    if strg is None:
        return
    if path is not None:
        # The context manager closes (and thereby flushes) the file even if
        # the write fails, instead of relying on garbage collection.
        with open(path, "w+b") as handle:
            handle.write(strg)
    elif sys.version_info > (3, 0):
        # v3: bytes have to go to the underlying binary buffer.
        sys.stdout.buffer.write(strg)
    else:
        sys.stdout.write(strg)
|
|
|
|
def format_string(self, xmldoc=""):
    """Format an XML document given as a string and return it encoded.

    Raises xml.parsers.expat.ExpatError if the document is not well-formed.
    """
    token_list = Formatter.TokenList(self)
    # The whole document is passed in one call, so it is flagged as the final
    # chunk; otherwise expat would keep waiting for more data and truncated
    # documents would not be reported as parse errors.
    token_list.parser.Parse(xmldoc, True)
    return self.enc_encode(str(token_list))
|
|
|
|
def format_file(self, file):
    """Format the XML document stored at path `file` and return it encoded.

    Raises IOError if the file cannot be opened and
    xml.parsers.expat.ExpatError if the document is not well-formed.
    """
    token_list = Formatter.TokenList(self)
    # The context manager closes the file even when parsing fails.
    with open(file, "rb") as fh:
        token_list.parser.ParseFile(fh)
    return self.enc_encode(str(token_list))
|
|
|
|
class TokenList:
|
|
# Being in a cdata section:
|
|
cdata_section = False
|
|
# Lock deletion of leading whitespace:
|
|
desc_mixed_level = None
|
|
# Lock indenting:
|
|
indent_level = None
|
|
# Reference the Formatter:
|
|
formatter = None
|
|
# Count levels:
|
|
level_counter = 0
|
|
# Lock deletion of whitespaces:
|
|
preserve_level = None
|
|
|
|
def __init__(self, formatter):
    """Create an empty token list and an expat parser feeding it."""
    # Keep tokens in a list:
    self._list = []
    self.formatter = formatter
    parser = xml.parsers.expat.ParserCreate(encoding=formatter.encoding_input)
    parser.specified_attributes = 1
    parser.buffer_text = True
    self.parser = parser
    # Every parser event pushes a token of the same name to the list:
    events = (
        "XmlDecl",
        "ElementDecl",
        "AttlistDecl",
        "EntityDecl",
        "StartElement",
        "EndElement",
        "ProcessingInstruction",
        "CharacterData",
        "Comment",
        "Default",
        "StartDoctypeDecl",
        "EndDoctypeDecl",
        "StartCdataSection",
        "EndCdataSection",
        "NotationDecl",
    )
    for event in events:
        setattr(parser, event + "Handler", self.xml_handler(event))
|
|
|
|
def __iter__(self):
    """Iterate over the tokens in document order."""
    tokens = self._list
    return iter(tokens)
|
|
|
|
def __len__(self):
    """Return the number of tokens in the list."""
    tokens = self._list
    return len(tokens)
|
|
|
|
def __getitem__(self, pos):
    """Return the token at `pos`; negative or too large positions raise IndexError."""
    if pos < 0 or pos >= len(self._list):
        raise IndexError
    return self._list[pos]
|
|
|
|
def __setitem__(self, pos, value):
    """Replace the token at `pos`; negative or too large positions raise IndexError."""
    if pos < 0 or pos >= len(self._list):
        raise IndexError
    self._list[pos] = value
|
|
|
|
def __str__(self):
    """Run all token processing steps and return the formatted XML document."""
    # Each step is completed for every token before the next step starts.
    for step in ("configure", "pre_operate", "post_operate"):
        for tk in self:
            getattr(tk, step)()
    document = "".join(str(tk) for tk in self)
    if self.formatter.eof_newline and not document.endswith("\n"):
        document += "\n"
    return document
|
|
|
|
def append(self, tk):
    """Add a token at the end of the list and record its position on it."""
    tokens = self._list
    tk.pos = len(tokens)
    tokens.append(tk)
|
|
|
|
def level_increment(self):
    """Raise the level counter by one."""
    self.level_counter = self.level_counter + 1
|
|
|
|
def level_decrement(self):
    """Lower the level counter by one."""
    self.level_counter = self.level_counter - 1
|
|
|
|
def token_descendant_mixed(self, tk):
    """Tell whether `tk` is a descendant of a text or mixed content element.

    The level of the outermost element with content model 2 (text) or
    3 (mixed) is remembered in self.desc_mixed_level until the matching
    end element is seen. That element itself is reported as False.
    """
    if tk.name == "StartElement":
        # Mark every descendant:
        if tk.content_model in [2, 3] and self.desc_mixed_level is None:
            self.desc_mixed_level = tk.level
            return False
        return self.desc_mixed_level is not None
    elif tk.name == "EndElement":
        # Stop marking every descendant. Levels are compared by value:
        # an identity check only works for ints the interpreter happens to
        # cache and fails for deeply nested documents.
        if tk.level == self.desc_mixed_level:
            self.desc_mixed_level = None
        elif self.desc_mixed_level is not None:
            return True
        return False
    elif self.desc_mixed_level is None:
        return False
    return self.desc_mixed_level >= tk.level - 1
|
|
|
|
def sequence(self, tk, scheme=None):
    """Return the tokens following or preceding `tk`.

    scheme "EndElement" (or no scheme and `tk` being an end element):
        the tokens before `tk`, nearest first, as a reversed iterator.
    otherwise: the list of tokens after `tk` up to the last one.
    """
    backwards = scheme == "EndElement" or (scheme is None and tk.end)
    if not backwards:
        return self._list[(tk.pos + 1) :]
    return reversed(self._list[: tk.pos])
|
|
|
|
def token_indent(self, tk):
    """Tell whether `tk` may be indented.

    With inline formatting enabled the decision is delegated to
    token_indent_inline(). Otherwise indenting is blocked for everything
    enclosed by an element with text or mixed content (model 2 or 3); the
    level of that element is kept in self.indent_level while it is open.
    """
    if self.formatter.inline:
        return self.token_indent_inline(tk)
    blocked = self.indent_level is not None
    if tk.name == "StartElement":
        # Block indenting for descendants of text and mixed content:
        if not blocked and tk.content_model in [2, 3]:
            self.indent_level = tk.level
            return True
        return not blocked
    if tk.name == "EndElement":
        # Unblock indenting for descendants of text and mixed content:
        if tk.level == self.indent_level:
            self.indent_level = None
            return False
        return not blocked
    return not blocked
|
|
|
|
def token_indent_inline(self, tk):
    """Indent every element content - no matter enclosed by text or mixed content.

    Walks backwards from `tk` to the nearest start element that either
    encloses `tk` (lower level) or, for an end element, opened it (same
    level). Indenting is allowed if that element has element content
    (model 1), or if no such element exists.
    """
    closing = tk.name == "EndElement"
    for earlier in self.sequence(tk, "EndElement"):
        if earlier.name != "StartElement":
            continue
        encloses = earlier.level < tk.level
        opens = closing and earlier.level == tk.level
        if encloses or opens:
            return earlier.content_model == 1
    return True
|
|
|
|
def token_model(self, tk):
    """Return the code of the content model of `tk`.

    0: empty
    1: element
    2: text
    3: mixed
    """
    has_element = has_text = False
    child_level = tk.level + 1
    for itk in self.sequence(tk):
        # Element boundary found:
        if itk.level <= tk.level:
            break
        # Only direct children decide about the model:
        if itk.level != child_level:
            continue
        if itk.start:
            has_element = True
        elif itk.not_empty:
            has_text = True
    return (1 if has_element else 0) + (2 if has_text else 0)
|
|
|
|
def token_preserve(self, tk):
    """Preserve every descendant of a preserved element.

    For start and end elements the result is a code:
    0: not locked
    1: just (un)locked
    2: locked
    For every other token the result is True while locked, else False.
    """
    locked = self.preserve_level is not None
    preserved_names = self.formatter.preserve
    # Lock preserving for StartElements:
    if tk.name == "StartElement":
        if locked:
            return 2
        if tk.arg[0] not in preserved_names:
            return 0
        self.preserve_level = tk.level
        return 1
    # Unlock preserving for EndElements:
    if tk.name == "EndElement":
        if tk.level == self.preserve_level and tk.arg[0] in preserved_names:
            self.preserve_level = None
            return 1
        return 2 if locked else 0
    return locked
|
|
|
|
def whitespace_append_trailing(self, tk):
    """Add a trailing whitespace to previous character data.

    Only done when correction is enabled and `tk` is non-empty character
    data that starts with whitespace.
    """
    if not self.formatter.correct:
        return
    if tk.leading and tk.not_empty:
        self.whitespace_append(tk, "EndElement", "StartElement", True)
|
|
|
|
def whitespace_append_leading(self, tk):
    """Add a leading whitespace to following character data.

    Only done when correction is enabled and `tk` is non-empty character
    data that ends with whitespace.
    """
    if not self.formatter.correct:
        return
    if tk.trailing and tk.not_empty:
        self.whitespace_append(tk)
|
|
|
|
def whitespace_append(
    self, tk, start="StartElement", stop="EndElement", direct=False
):
    """Add a whitespace token next to the nearest suitable neighbour of `tk`.

    The tokens returned by sequence(tk, start) are scanned in order. The
    scan stops without inserting at an empty character data token, at a
    `stop` token that is not a descendant of mixed content, or at a `start`
    token directly adjacent to `tk`. The first non-empty character data
    token, or `start` token inside mixed content, gets the whitespace.
    """
    for itk in self.sequence(tk, start):
        if itk.empty:
            return
        if itk.name == stop and itk.descendant_mixed is False:
            return
        is_start = itk.name == start
        if is_start and abs(tk - itk) == 1:
            return
        if itk.not_empty or (is_start and itk.descendant_mixed):
            self.insert_empty(itk, direct)
            return
|
|
|
|
def whitespace_delete_leading(self, tk):
    """Tell whether the leading whitespace of `tk` has to be removed.

    Always False without correction, for tokens without leading whitespace
    and for preserved or CDATA content. Otherwise the preceding tokens are
    scanned, nearest first: a token with trailing whitespace yields True;
    an end element, other character data or the end of a CDATA section
    yields False; running out of tokens yields True.
    """
    if not self.formatter.correct or not tk.leading:
        return False
    if tk.preserve or tk.cdata_section:
        return False
    for itk in self.sequence(tk, "EndElement"):
        if itk.trailing:
            return True
        if itk.name in ["EndElement", "CharacterData", "EndCdataSection"]:
            return False
    return True
|
|
|
|
def whitespace_delete_trailing(self, tk):
    """Tell whether the trailing whitespace of `tk` has to be removed.

    Always False without correction, for tokens without trailing whitespace
    and for preserved or CDATA content. Otherwise the following tokens are
    scanned: an end element yields True; a start element, the start of a
    CDATA section or non-empty character data yields False; running out of
    tokens yields True.
    """
    if not self.formatter.correct or not tk.trailing:
        return False
    if tk.preserve or tk.cdata_section:
        return False
    for itk in self.sequence(tk, "StartElement"):
        if itk.end:
            return True
        if itk.name in ["StartElement", "StartCdataSection"] or itk.not_empty:
            return False
    return True
|
|
|
|
def insert_empty(self, tk, before=True):
    """ Insert an Empty Token into token list - before or after tk.

    The new token is character data holding a single blank. Nothing is
    inserted (and False is returned) when `tk` is the first or the last
    token of the list; otherwise the method returns None.
    """
    # Only tokens strictly inside the list get a neighbour:
    if not (0 < tk.pos < (len(self) - 1)):
        return False
    ptk = self[tk.pos - 1]
    ntk = self.formatter.CharacterData(self, [" "])
    # The new token takes over its properties from tk and the token in front of tk:
    ntk.level = max(ptk.level, tk.level)
    ntk.descendant_mixed = tk.descendant_mixed
    ntk.preserve = ptk.preserve * tk.preserve
    ntk.cdata_section = ptk.cdata_section or tk.cdata_section
    # NOTE(review): a truthy `before` places the new token at index
    # tk.pos + 1, i.e. behind tk in list order, and a falsy one at tk.pos,
    # i.e. in front of tk. This looks inverted relative to the parameter
    # name; confirm against the callers before changing it.
    if before:
        self._list.insert(tk.pos + 1, ntk)
    else:
        self._list.insert(tk.pos, ntk)
    # Renumber the stored positions from the token in front of tk onwards:
    for i in range((tk.pos - 1), len(self._list)):
        self._list[i].pos = i
|
|
|
|
def xml_handler(self, key):
    """Return a parser callback which adds a token of class `key` to the token list."""

    def handler(*arg):
        # The token class is looked up on the formatter when the event fires.
        return self.append(getattr(self.formatter, key)(self, arg))

    return handler
|
|
|
|
class Token(object):
    """Base class of all tokens created from parser events.

    A token stores the arguments of its parser event in `arg` and renders
    itself through __unicode__(). The token list calls configure(),
    pre_operate() and post_operate() on every token before rendering.
    """

    def __init__(self, tklist, arg):
        # Reference Token List:
        self.list = tklist
        # Token data:
        self.arg = list(arg)
        # Token is placed in a CDATA section:
        self.cdata_section = False
        # Token has content model:
        self.content_model = None
        # Remove trailing whitespaces:
        self.delete_trailing = False
        # Remove leading whitespaces:
        self.delete_leading = False
        # Token is descendant of text or mixed content element:
        self.descendant_mixed = False
        # Reference to formatter:
        self.formatter = tklist.formatter
        # Insert indenting white spaces:
        self.indent = False
        # N-th generation of roots descendants:
        self.level = self.list.level_counter
        # Token class:
        self.name = self.__class__.__name__
        # Preserve white spaces within enclosed tokens:
        self.preserve = False
        # Position in token list:
        self.pos = None

    def __sub__(self, other):
        """Return the distance between two tokens in the token list."""
        return self.pos - other.pos

    def __unicode__(self):
        """Return the rendered token; the base token renders to nothing."""
        return ""

    # Workaround, see http://lucumr.pocoo.org/2011/1/22/forwards-compatible-python/:
    if sys.version_info > (3, 0):
        __str__ = lambda x: x.__unicode__()
    else:
        __str__ = lambda x: unicode(x).encode("utf-8")

    @property
    def end(self):
        """True for end element tokens."""
        return self.name == "EndElement"

    @property
    def empty(self):
        """Truthy for character data consisting of whitespace only (or nothing)."""
        return self.name == "CharacterData" and re.match(
            r"^[\t\s\n]*$", self.arg[0]
        )

    @property
    def leading(self):
        """Truthy for character data starting with whitespace."""
        return self.name == "CharacterData" and re.search(
            r"^[\t\s\n]+", self.arg[0]
        )

    @property
    def not_empty(self):
        """True for character data outside CDATA that is not whitespace only."""
        return (
            self.name == "CharacterData"
            and not self.cdata_section
            and not re.match(r"^[\t\s\n]+$", self.arg[0])
        )

    @property
    def trailing(self):
        """Truthy for character data ending with whitespace."""
        return self.name == "CharacterData" and re.search(
            r"[\t\s\n]+$", self.arg[0]
        )

    @property
    def start(self):
        """True for start element tokens."""
        return self.name == "StartElement"

    @property
    def correct(self):
        """The correction option of the formatter."""
        return self.formatter.correct

    @staticmethod
    def _escape_attribute(value):
        """Return `value` as text that is safe inside a double-quoted attribute."""
        text = "%s" % value
        # "&" has to be replaced first to keep the other references intact.
        return (
            text.replace("&", "&amp;").replace("<", "&lt;").replace('"', "&quot;")
        )

    def attribute(self, key, value):
        """Return ` key="value"` ready to be placed inside a tag.

        The parser hands over attribute values with all references already
        resolved, so markup characters are escaped again here. An empty
        value gives ` key=""`, an empty key gives an empty string.
        """
        if key and value:
            return ' %s="%s"' % (key, self._escape_attribute(value))
        elif key:
            return ' %s=""' % (key)
        return ""

    def indent_insert(self):
        """ Indent token. """
        # Child of root and no empty node
        if (
            self.level > 0 and not (self.end and self.list[self.pos - 1].start)
        ) or (  # not empty node:
            self.end and not self.list[self.pos - 1].start
        ):
            return self.indent_create(self.level)
        return ""

    def indent_create(self, times=1):
        """Return a newline followed by the indent for `times` levels.

        Returns an empty string in compress mode or with an indent of 0.
        """
        if not self.formatter.compress and self.formatter.indent:
            return "\n%s" % (
                (times * self.formatter.indent) * self.formatter.indent_char
            )
        return ""

    def identifier(self, systemid, publicid):
        """Return the PUBLIC/SYSTEM part of a declaration (may be empty)."""
        # TODO add base parameter:
        if publicid and systemid:
            return ' PUBLIC "%s" "%s"' % (publicid, systemid)
        elif publicid:
            return ' PUBLIC "%s"' % publicid
        elif systemid:
            return ' SYSTEM "%s"' % systemid
        return ""

    def configure(self):
        """ Set token properties. """
        self.descendant_mixed = self.list.token_descendant_mixed(self)
        self.preserve = self.list.token_preserve(self)
        self.cdata_section = self.list.cdata_section

    def pre_operate(self):
        """Hook called for every token after all tokens are configured."""
        pass

    def post_operate(self):
        """Hook called for every token after all pre_operate() calls."""
        pass
|
|
|
|
class AttlistDecl(Token):
    """Token for an <!ATTLIST ...> declaration."""

    def __unicode__(self):
        """Render the declaration from the parser arguments.

        arg[0] and arg[1] are written right behind the keyword, arg[2] is
        added when it is not None. arg[3] is used as the quoted default
        value and, together with arg[4], selects the keyword #REQUIRED,
        #FIXED or #IMPLIED.
        """
        default, required = self.arg[3], self.arg[4]
        parts = [
            self.indent_create(),
            "<!ATTLIST %s %s" % (self.arg[0], self.arg[1]),
        ]
        if self.arg[2] is not None:
            parts.append(" %s" % self.arg[2])
        if required and not default:
            parts.append(" #REQUIRED")
        elif default and required:
            parts.append(" #FIXED")
        elif not required and not default:
            parts.append(" #IMPLIED")
        if default:
            parts.append(' "%s"' % default)
        parts.append(">")
        return "".join(parts)
|
|
|
|
class CharacterData(Token):
    """Token for text between tags."""

    def __unicode__(self):
        """Render the text, normalizing whitespace and escaping markup."""
        text = self.arg[0]
        if not self.preserve and not self.cdata_section:
            # remove empty tokens always in element content!
            if self.empty and not self.descendant_mixed:
                keep_blank_line = (
                    self.formatter.blanks
                    and not self.formatter.compress
                    and re.match(r"\s*\n\s*\n\s*", text)
                )
                text = "\n" if keep_blank_line else ""
            else:
                if self.correct:
                    text = re.sub(r"\r\n", "\n", text)
                    text = re.sub(r"\r|\n|\t", " ", text)
                    text = re.sub(r"\s+", " ", text)
                if self.delete_leading:
                    text = re.sub(r"^\s", "", text)
                if self.delete_trailing:
                    text = re.sub(r"\s$", "", text)
        if not self.cdata_section:
            # The parser delivers text with all references resolved, so the
            # markup characters have to be escaped again to keep the output
            # well-formed. "&" is replaced first to keep "&lt;" intact.
            text = text.replace("&", "&amp;").replace("<", "&lt;")
        return text

    def pre_operate(self):
        """Let the token list add the whitespace needed around this text."""
        self.list.whitespace_append_trailing(self)
        self.list.whitespace_append_leading(self)

    def post_operate(self):
        """Ask the token list whether leading/trailing whitespace is removed."""
        self.delete_leading = self.list.whitespace_delete_leading(self)
        self.delete_trailing = self.list.whitespace_delete_trailing(self)
|
|
|
|
class Comment(Token):
    """Token for an XML comment."""

    def __unicode__(self):
        """Render the comment, indented if allowed."""
        prefix = ""
        if self.preserve in [0, 1] and self.indent:
            prefix = self.indent_insert()
        # Collapse line breaks at the very beginning of the comment text:
        body = re.sub(r"^[\r\n]+", "\n", self.arg[0])
        body = re.sub(r"^[\r\n]+$", "\n", body)
        return prefix + "<!--%s-->" % body

    def configure(self):
        """Set the common token properties and the indent flag."""
        super(Formatter.Comment, self).configure()
        self.indent = self.list.token_indent(self)
|
|
|
|
class Default(Token):
    """Token for the parser's Default event; it adds nothing to the base Token."""
|
|
|
|
class EndCdataSection(Token):
    """Token for the end of a CDATA section."""

    def __unicode__(self):
        """Render the closing delimiter of the section."""
        return "]]>"

    def configure(self):
        """Tell the token list that the CDATA section is left."""
        self.list.cdata_section = False
|
|
|
|
class ElementDecl(Token):
    """Token for an <!ELEMENT ...> declaration."""

    def __unicode__(self):
        """Render the declaration with its content model."""
        declaration = "<!ELEMENT %s%s>" % (
            self.arg[0],
            self.evaluate_model(self.arg[1]),
        )
        return self.indent_create() + declaration

    def evaluate_model(self, model, modelStr="", concatStr=""):
        """Return the textual form of an expat content model.

        `model` is indexed as (type, quantifier, name, children). Children
        are rendered recursively and joined by "|" (choice, mixed) or ","
        (sequence).
        """
        kinds = xml.parsers.expat.model
        kind = model[0]
        children = model[3]
        is_mixed = kind == kinds.XML_CTYPE_MIXED
        has_group = len(children) or is_mixed
        if kind == kinds.XML_CTYPE_EMPTY:  # 1
            modelStr += " EMPTY"
        elif kind == kinds.XML_CTYPE_ANY:  # 2
            modelStr += " ANY"
        elif kind == kinds.XML_CTYPE_NAME:  # 4
            modelStr = "%s" % model[2]  # new start
        elif kind in (kinds.XML_CTYPE_CHOICE, kinds.XML_CTYPE_MIXED):  # 5
            concatStr = "|"
        elif kind == kinds.XML_CTYPE_SEQ:  # 6
            concatStr = ","
        members = ["#PCDATA"] if is_mixed else []
        for child in children:
            members.append(self.evaluate_model(child))
        group = concatStr.join(members)
        if has_group:
            group = " (" + group + ")"
        quantifiers = {
            kinds.XML_CQUANT_NONE: "",
            kinds.XML_CQUANT_OPT: "?",
            kinds.XML_CQUANT_PLUS: "+",
            kinds.XML_CQUANT_REP: "*",
        }
        return modelStr + group + quantifiers[model[1]]
|
|
|
|
class EndDoctypeDecl(Token):
    """Token for the end of a document type declaration."""

    def __unicode__(self):
        """Close the internal subset (if one was opened) and the declaration."""
        previous = self.list[self.pos - 1]
        out = ""
        if previous.name != "StartDoctypeDecl":
            # Declarations were written since the start of the doctype:
            out += self.indent_create(0)
            out += "]"
        elif previous.arg[3]:
            # The start token opened an internal subset ("[") that turned
            # out to be empty; it still has to be closed.
            out += "]"
        out += ">"
        out += self.indent_create(0)
        return out
|
|
|
|
class EndElement(Token):
    """Token for an end tag."""

    def __init__(self, list, arg):
        # The level is decremented first, so the end tag gets the same
        # level as its start tag.
        list.level_decrement()
        super(Formatter.EndElement, self).__init__(list, arg)

    def __unicode__(self):
        """Render the end tag, or nothing if the start tag is self-closing."""
        collapse = self.formatter.compress or self.formatter.selfclose
        # Don't close empty nodes on compression mode:
        if collapse and self.list[self.pos - 1].name == "StartElement":
            return ""
        closing = "</%s>" % self.arg[0]
        if self.preserve in [0] and self.indent:
            return self.indent_insert() + closing
        return closing

    def configure(self):
        """Set the mixed-content, preserve and indent properties."""
        self.descendant_mixed = self.list.token_descendant_mixed(self)
        self.preserve = self.list.token_preserve(self)
        self.indent = self.list.token_indent(self)
|
|
|
|
class EntityDecl(Token):
    """Token for an <!ENTITY ...> declaration."""

    def __unicode__(self):
        """Render the declaration from the parser arguments.

        arg[0] is written as the name and a truthy arg[1] adds the "% "
        marker. A truthy arg[2] is written as the quoted value; otherwise
        the identifier built from arg[4] and arg[5] is written, followed by
        "NDATA" and arg[6] when arg[6] is set.
        """
        parts = [self.indent_create(), "<!ENTITY "]
        if self.arg[1]:
            parts.append("% ")
        parts.append("%s " % self.arg[0])
        if self.arg[2]:
            parts.append('"%s"' % self.arg[2])
        else:
            parts.append("%s " % self.identifier(self.arg[4], self.arg[5]))
            if self.arg[6]:
                parts.append("NDATA %s" % self.arg[6])
        parts.append(">")
        return "".join(parts)
|
|
|
|
class NotationDecl(Token):
    """Token for a <!NOTATION ...> declaration."""

    def __unicode__(self):
        """Render the declaration: arg[0] plus the identifier of arg[2] and arg[3]."""
        external_id = self.identifier(self.arg[2], self.arg[3])
        declaration = "<!NOTATION %s%s>" % (self.arg[0], external_id)
        return self.indent_create() + declaration
|
|
|
|
class ProcessingInstruction(Token):
    """Token for a processing instruction."""

    def __unicode__(self):
        """Render the instruction as <?arg[0] arg[1]?>, indented if allowed."""
        instruction = "<?%s %s?>" % (self.arg[0], self.arg[1])
        if self.preserve in [0, 1] and self.indent:
            return self.indent_insert() + instruction
        return instruction

    def configure(self):
        """Set the common token properties and the indent flag."""
        super(Formatter.ProcessingInstruction, self).configure()
        self.indent = self.list.token_indent(self)
|
|
|
|
class StartCdataSection(Token):
    """Token for the start of a CDATA section."""

    def __unicode__(self):
        """Render the opening delimiter of the section."""
        return "<![CDATA["

    def configure(self):
        """Tell the token list that a CDATA section is entered."""
        self.list.cdata_section = True
|
|
|
|
class StartDoctypeDecl(Token):
    """Token for the start of a document type declaration."""

    def __unicode__(self):
        """Render "<!DOCTYPE arg[0]", an identifier if arg[1] is set and
        an opening " [" if arg[3] is set."""
        parts = ["<!DOCTYPE %s" % (self.arg[0])]
        if self.arg[1]:
            parts.append(self.identifier(self.arg[1], self.arg[2]))
        if self.arg[3]:
            parts.append(" [")
        return "".join(parts)
|
|
|
|
class StartElement(Token):
    """Token for a start tag."""

    def __init__(self, list, arg):
        super(Formatter.StartElement, self).__init__(list, arg)
        # Everything up to the matching end tag is one level deeper:
        self.list.level_increment()

    def __unicode__(self):
        """Render the start tag with its attributes sorted by name.

        The tag is written self-closing when it is directly followed by
        an end element and compress or selfclose mode is active.
        """
        pieces = []
        if self.preserve in [0, 1] and self.indent:
            pieces.append(self.indent_insert())
        pieces.append("<%s" % self.arg[0])
        attributes = self.arg[1]
        for attr in sorted(attributes.keys()):
            pieces.append(self.attribute(attr, attributes[attr]))
        collapse = self.list[self.pos + 1].end and (
            self.formatter.compress or self.formatter.selfclose
        )
        pieces.append("/>" if collapse else ">")
        return "".join(pieces)

    def configure(self):
        """Set content model, mixed-content, preserve and indent properties."""
        self.content_model = self.list.token_model(self)
        self.descendant_mixed = self.list.token_descendant_mixed(self)
        self.preserve = self.list.token_preserve(self)
        self.indent = self.list.token_indent(self)
|
|
|
|
class XmlDecl(Token):
    """Token for the XML declaration."""

    def __init__(self, list, arg):
        super(Formatter.XmlDecl, self).__init__(list, arg)
        # Remember the encoding named by the document itself:
        if len(self.arg) > 1:
            self.formatter.encoding_internal = self.arg[1]

    def __unicode__(self):
        """Render the declaration with the effective output encoding.

        arg[2] carries the standalone flag of the parser: -1 if the document
        did not specify it, otherwise 0 for "no" and 1 for "yes".
        """
        out = "<?xml%s%s" % (
            self.attribute("version", self.arg[0]),
            self.attribute("encoding", self.formatter.encoding_effective),
        )
        if self.arg[2] > -1:
            # Keep the value of the document instead of always writing "yes":
            out += self.attribute("standalone", "yes" if self.arg[2] else "no")
        out += "?>\n"
        return out
|
|
|
|
|
|
def cli_usage(msg=""):
    """Write `msg` and the usage of the command line tool to stderr, then exit.

    The process is always terminated with exit status 2.
    """
    # The usage line is assembled from its parts, so that its spacing does
    # not depend on how the source lines of a continued literal are indented.
    options = (
        '[--preserve "pre,literal"]',
        "[--blanks]",
        "[--compress]",
        "[--selfclose]",
        "[--indent num]",
        "[--indent-char char]",
        "[--outfile file]",
        "[--encoding enc]",
        "[--outencoding enc]",
        "[--disable-inlineformatting]",
        "[--overwrite]",
        "[--disable-correction]",
        "[--eof-newline]",
        "[--help]",
        "<--infile file | file | - >",
    )
    sys.stderr.write(msg + "\n")
    sys.stderr.write("Usage: xmlformat " + " ".join(options) + "\n")
    sys.exit(2)
|
|
|
|
|
|
def cli():
    """Launch xmlformatter from command line.

    Parses the options from sys.argv, formats the document given by
    --infile, by the first positional argument or by stdin ("-") and writes
    the result to stdout, to --outfile or, with --overwrite, back to the
    input file. Every error ends in cli_usage(), which exits with status 2.
    """
    res = None
    indent = DEFAULT_INDENT
    indent_char = DEFAULT_INDENT_CHAR
    outfile = None
    overwrite = False
    preserve = []
    blanks = False
    compress = DEFAULT_COMPRESS
    selfclose = DEFAULT_SELFCLOSE
    infile = None
    encoding = DEFAULT_ENCODING_INPUT
    outencoding = DEFAULT_ENCODING_OUTPUT
    inline = DEFAULT_INLINE
    correct = DEFAULT_CORRECT
    eof_newline = DEFAULT_EOF_NEWLINE
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            "",
            [
                "compress",
                "selfclose",
                "disable-correction",
                "disable-inlineformatting",
                "encoding=",
                "help",
                "infile=",
                "indent=",
                "indent-char=",
                "outfile=",
                "outencoding=",
                "overwrite",
                "preserve=",
                "blanks",
                "eof-newline",
            ],
        )
    except getopt.GetoptError as err:
        cli_usage(str(err))
    for key, value in opts:
        if key == "--indent":
            indent = value
        elif key == "--preserve":
            # Element names may be separated by commas and/or whitespace:
            preserve = value.replace(",", " ").split()
        elif key == "--blanks":
            blanks = True
        elif key == "--help":
            cli_usage()
        elif key == "--compress":
            compress = True
        elif key == "--selfclose":
            selfclose = True
        elif key == "--outfile":
            outfile = value
        elif key == "--infile":
            infile = value
        elif key == "--encoding":
            encoding = value
        elif key == "--outencoding":
            outencoding = value
        elif key == "--indent-char":
            indent_char = value
        elif key == "--disable-inlineformatting":
            inline = False
        elif key == "--disable-correction":
            correct = False
        elif key == "--overwrite":
            overwrite = True
        elif key == "--eof-newline":
            eof_newline = True
    try:
        formatter = Formatter(
            indent=indent,
            preserve=preserve,
            blanks=blanks,
            compress=compress,
            selfclose=selfclose,
            encoding_input=encoding,
            encoding_output=outencoding,
            indent_char=indent_char,
            inline=inline,
            correct=correct,
            eof_newline=eof_newline,
        )
        input_file = None
        if infile:
            input_file = infile
            res = formatter.format_file(input_file)
        elif len(args) > 0:
            if args[0] == "-":
                res = formatter.format_string("".join(sys.stdin.readlines()))
            else:
                input_file = args[0]
                res = formatter.format_file(input_file)
    except xml.parsers.expat.ExpatError as err:
        cli_usage("XML error: %s" % err)
    except IOError as err:
        cli_usage("IO error: %s" % err)
    except Exception as err:
        # Only real errors are reported here; KeyboardInterrupt and
        # SystemExit are no longer swallowed, and the cause is shown.
        cli_usage("Unknown error: %s" % err)

    # Without an input file (stdin) --overwrite falls back to stdout:
    if overwrite:
        formatter.enc_output(input_file, res)
    else:
        formatter.enc_output(outfile, res)