From b0615afdd9f0aefbd6582df34c9f3eab309e7803 Mon Sep 17 00:00:00 2001
From: dlichteblau
Date: Thu, 29 Dec 2005 00:31:30 +0000
Subject: [PATCH] whitespace normalizer
---
README.html | 3 +-
cxml.asd | 1 +
doc/dom.html | 2 +-
doc/installation.html | 4 +--
doc/quickstart.html | 2 +-
doc/using.html | 31 +++++++++++++++++++-
doc/xmls-compat.html | 2 +-
runes/ystream.lisp | 1 +
xml/package.lisp | 1 +
xml/recoder.lisp | 6 +++-
xml/sax-handler.lisp | 25 +++++++++++++++-
xml/sax-proxy.lisp | 3 +-
xml/space-normalizer.lisp | 62 +++++++++++++++++++++++++++++++++++++++
xml/xml-parse.lisp | 3 +-
14 files changed, 135 insertions(+), 11 deletions(-)
create mode 100644 xml/space-normalizer.lisp
diff --git a/README.html b/README.html
index 184555e..f11b49e 100644
--- a/README.html
+++ b/README.html
@@ -23,7 +23,7 @@
@@ -112,6 +112,7 @@ Recent Changes
UTF-8 string support in DOM on Lisps without Unicode characters.
Sink API has been changed.
Support internal subset serialization.
+ Whitespace normalizer.
Gilbert Baumann has clarified the license as Lisp-LGPL.
Use trivial-gray-streams.
diff --git a/cxml.asd b/cxml.asd
index 43a6647..4b85a45 100644
--- a/cxml.asd
+++ b/cxml.asd
@@ -71,6 +71,7 @@
(:file "xmls-compat" :depends-on ("xml-parse"))
(:file "recoder" :depends-on ("xml-parse"))
(:file "xmlns-normalizer" :depends-on ("xml-parse"))
+ (:file "space-normalizer" :depends-on ("xml-parse"))
(:file "catalog" :depends-on ("xml-parse"))
(:file "sax-proxy" :depends-on ("xml-parse")))
:depends-on (:cxml-runes :puri :trivial-gray-streams))
diff --git a/doc/dom.html b/doc/dom.html
index 2f06f66..1176270 100644
--- a/doc/dom.html
+++ b/doc/dom.html
@@ -23,7 +23,7 @@
diff --git a/doc/installation.html b/doc/installation.html
index c10ee97..70a1b7c 100644
--- a/doc/installation.html
+++ b/doc/installation.html
@@ -23,7 +23,7 @@
@@ -116,7 +116,7 @@ Compilation
* (asdf:operate 'asdf:load-op :cxml)
- You can then try the quick-start example.
+ You can then try the quick-start example.
diff --git a/doc/quickstart.html b/doc/quickstart.html
index 2afacd1..c69c3e1 100644
--- a/doc/quickstart.html
+++ b/doc/quickstart.html
@@ -23,7 +23,7 @@
diff --git a/doc/using.html b/doc/using.html
index ecbd003..622f571 100644
--- a/doc/using.html
+++ b/doc/using.html
@@ -23,7 +23,7 @@
@@ -388,6 +388,32 @@ Miscellaneous SAX handlers
start-element events before passing them on the next
handler.
+
+
Function CXML:MAKE-WHITESPACE-NORMALIZER (chained-handler &optional dtd)
+ Return a SAX handler which removes whitespace from elements that
+ have element content and have not been declared to
+ preserve space using an xml:space attribute.
+
+ Example:
+ (cxml:parse-file "example.xml"
+ (cxml:make-whitespace-normalizer (cxml-dom:make-dom-builder))
+ :validate t)
+ Example input:
+ <!DOCTYPE test [
+<!ELEMENT test (foo,bar*)>
+<!ATTLIST test a CDATA #IMPLIED>
+<!ELEMENT foo #PCDATA>
+<!ELEMENT bar (foo?)>
+<!ATTLIST bar xml:space (default|preserve) "default">
+]>
+<test a='b'>
+ <foo> </foo>
+ <bar> </bar>
+ <bar xml:space="preserve"> </bar>
+</test>
+
+ Example result:
+ <test a="b"><foo> </foo><bar></bar><bar xml:space="preserve"> </bar></test>
Recoders
@@ -572,6 +598,9 @@ SAX Interface
Accessor SAX:ATTRIBUTE-QNAME (attribute)
Accessor SAX:ATTRIBUTE-SPECIFIED-P (attribute)
Accessor SAX:ATTRIBUTE-VALUE (attribute)
+
+ Function SAX:FIND-ATTRIBUTE (qname attributes)
+ Function SAX:FIND-ATTRIBUTE-NS (uri lname attributes)
The entity declaration methods are similar to Java SAX
diff --git a/doc/xmls-compat.html b/doc/xmls-compat.html
index 4474b39..f959a8d 100644
--- a/doc/xmls-compat.html
+++ b/doc/xmls-compat.html
@@ -23,7 +23,7 @@
diff --git a/runes/ystream.lisp b/runes/ystream.lisp
index e574f3a..13f1af6 100644
--- a/runes/ystream.lisp
+++ b/runes/ystream.lisp
@@ -223,6 +223,7 @@
#+rune-is-integer
(progn
(defstruct (character-stream-ystream/utf8
+ (:constructor make-character-stream-ystream/utf8 (os-stream))
(:include %stream-ystream)
(:conc-name "YSTREAM-")))
diff --git a/xml/package.lisp b/xml/package.lisp
index 8fc1255..c3b358c 100644
--- a/xml/package.lisp
+++ b/xml/package.lisp
@@ -77,5 +77,6 @@
#:sax-proxy
#:proxy-chained-handler
#:make-namespace-normalizer
+ #:make-whitespace-normalizer
#:rod-to-utf8-string
#:utf8-string-to-rod))
diff --git a/xml/recoder.lisp b/xml/recoder.lisp
index 0816377..b40a2c1 100644
--- a/xml/recoder.lisp
+++ b/xml/recoder.lisp
@@ -118,4 +118,8 @@
(defwrapper sax:entity-resolver
(resolver)
- resolver))
+ resolver)
+
+ (defwrapper sax::dtd
+ (dtd)
+ dtd))
diff --git a/xml/sax-handler.lisp b/xml/sax-handler.lisp
index faec86c..364ab01 100644
--- a/xml/sax-handler.lisp
+++ b/xml/sax-handler.lisp
@@ -53,6 +53,8 @@
#:*use-xmlns-namespace*
#:make-attribute
+ #:find-attribute
+ #:find-attribute-ns
#:attribute-namespace-uri
#:attribute-local-name
#:attribute-qname
@@ -137,6 +139,23 @@ Setting this variable has no effect unless both
value
specified-p)
+(defun %rod= (x y)
+ ;; allow rods *and* strings *and* null
+ (cond
+ ((zerop (length x)) (zerop (length y)))
+ ((zerop (length y)) nil)
+ ((stringp x) (string= x y))
+ (t (runes:rod= x y))))
+
+(defun find-attribute (qname attrs)
+ (find qname attrs :key #'attribute-qname :test #'%rod=))
+
+(defun find-attribute-ns (uri lname attrs)
+ (find-if (lambda (attr)
+ (and (%rod= uri (sax:attribute-namespace-uri attr))
+ (%rod= lname (sax:attribute-local-name attr))))
+ attrs))
+
(defgeneric start-document (handler)
(:documentation "Called at the beginning of the parsing process,
before any element, processing instruction or comment is reported.
@@ -325,7 +344,11 @@ finished, if present.")
(:documentation
"Called between sax:end-dtd and sax:end-document to register an entity
resolver, a function of two arguments: An entity name and SAX handler.
- When called, the resolver function will parse the named entities data.")
+ When called, the resolver function will parse the named entity's data.")
(:method ((handler t) resolver)
(declare (ignore resolver))
nil))
+
+;; internal for now
+(defgeneric dtd (handler dtd)
+ (:method ((handler t) dtd) (declare (ignore dtd)) nil))
diff --git a/xml/sax-proxy.lisp b/xml/sax-proxy.lisp
index a6fa3da..4db10f0 100644
--- a/xml/sax-proxy.lisp
+++ b/xml/sax-proxy.lisp
@@ -37,4 +37,5 @@
(define-proxy-method sax:notation-declaration (name public-id system-id))
(define-proxy-method sax:element-declaration (name model))
(define-proxy-method sax:attribute-declaration (elt attr type default))
- (define-proxy-method sax:entity-resolver (resolver)))
+ (define-proxy-method sax:entity-resolver (resolver))
+ (define-proxy-method sax::dtd (dtd)))
diff --git a/xml/space-normalizer.lisp b/xml/space-normalizer.lisp
new file mode 100644
index 0000000..d4c9c14
--- /dev/null
+++ b/xml/space-normalizer.lisp
@@ -0,0 +1,62 @@
+;;;; space-normalizer.lisp -- whitespace removal
+;;;;
+;;;; This file is part of the CXML parser, released under Lisp-LGPL.
+;;;; See file COPYING for details.
+;;;;
+;;;; Copyright (c) 2005 David Lichteblau
+
+(in-package :cxml)
+
+(defclass whitespace-normalizer (sax-proxy)
+ ((attributes :initform '(t) :accessor xml-space-attributes)
+ (models :initform nil :accessor xml-space-models)
+ (dtd :initarg :dtd :accessor xml-space-dtd)))
+
+(defun make-whitespace-normalizer (chained-handler &optional dtd)
+ (make-instance 'whitespace-normalizer
+ :dtd dtd
+ :chained-handler chained-handler))
+
+(defmethod sax::dtd ((handler whitespace-normalizer) dtd)
+ (unless (xml-space-dtd handler)
+ (setf (xml-space-dtd handler) dtd)))
+
+(defmethod sax:start-element
+ ((handler whitespace-normalizer) uri lname qname attrs)
+ (declare (ignore uri lname))
+ (let ((dtd (xml-space-dtd handler)))
+ (when dtd
+ (let ((xml-space
+ (sax:find-attribute (if (stringp qname) "xml:space" #"xml:space")
+ attrs)))
+ (push (print(if xml-space
+ (rod= (rod (sax:attribute-value xml-space)) #"default")
+ (car (xml-space-attributes handler))))
+ (xml-space-attributes handler)))
+ (let* ((e (cxml::find-element (rod qname) dtd))
+ (cspec (when e (cxml::elmdef-content e))))
+ (push (and (consp cspec)
+ (not (and (eq (car cspec) '*)
+ (let ((subspec (second cspec)))
+ (and (eq (car subspec) 'or)
+ (eq (cadr subspec) :PCDATA))))))
+ (xml-space-models handler)))))
+ (call-next-method))
+
+(defmethod sax:characters ((handler whitespace-normalizer) data)
+ (cond
+ ((and (xml-space-dtd handler)
+ (car (xml-space-attributes handler))
+ (car (xml-space-models handler)))
+ (unless (every #'white-space-rune-p (rod data))
+ (warn "non-whitespace character data in element content")
+ (call-next-method)))
+ (t
+ (call-next-method))))
+
+(defmethod sax:end-element ((handler whitespace-normalizer) uri lname qname)
+ (declare (ignore uri lname qname))
+ (when (xml-space-dtd handler)
+ (pop (xml-space-attributes handler))
+ (pop (xml-space-models handler)))
+ (call-next-method))
diff --git a/xml/xml-parse.lisp b/xml/xml-parse.lisp
index 6d60367..0582ebc 100644
--- a/xml/xml-parse.lisp
+++ b/xml/xml-parse.lisp
@@ -2513,7 +2513,8 @@
(let ((dtd (dtd *ctx*)))
(sax:entity-resolver
(handler *ctx*)
- (lambda (name handler) (resolve-entity name handler dtd))))
+ (lambda (name handler) (resolve-entity name handler dtd)))
+ (sax::dtd (handler *ctx*) dtd))
(list :DOCTYPE name extid))))
(defun report-cached-dtd (dtd)