From b0615afdd9f0aefbd6582df34c9f3eab309e7803 Mon Sep 17 00:00:00 2001 From: dlichteblau Date: Thu, 29 Dec 2005 00:31:30 +0000 Subject: [PATCH] whitespace normalizer --- README.html | 3 +- cxml.asd | 1 + doc/dom.html | 2 +- doc/installation.html | 4 +-- doc/quickstart.html | 2 +- doc/using.html | 31 +++++++++++++++++++- doc/xmls-compat.html | 2 +- runes/ystream.lisp | 1 + xml/package.lisp | 1 + xml/recoder.lisp | 6 +++- xml/sax-handler.lisp | 25 +++++++++++++++- xml/sax-proxy.lisp | 3 +- xml/space-normalizer.lisp | 62 +++++++++++++++++++++++++++++++++++++++ xml/xml-parse.lisp | 3 +- 14 files changed, 135 insertions(+), 11 deletions(-) create mode 100644 xml/space-normalizer.lisp diff --git a/README.html b/README.html index 184555e..f11b49e 100644 --- a/README.html +++ b/README.html @@ -23,7 +23,7 @@
  • @@ -112,6 +112,7 @@

    Recent Changes

  • UTF-8 string support in DOM on Lisps without Unicode characters.
  • Sink API has been changed.
  • Support internal subset serialization.
  • +
  • Whitespace normalizer.
  • Gilbert Baumann has clarified the license as Lisp-LGPL.
  • Use trivial-gray-streams.
  • diff --git a/cxml.asd b/cxml.asd index 43a6647..4b85a45 100644 --- a/cxml.asd +++ b/cxml.asd @@ -71,6 +71,7 @@ (:file "xmls-compat" :depends-on ("xml-parse")) (:file "recoder" :depends-on ("xml-parse")) (:file "xmlns-normalizer" :depends-on ("xml-parse")) + (:file "space-normalizer" :depends-on ("xml-parse")) (:file "catalog" :depends-on ("xml-parse")) (:file "sax-proxy" :depends-on ("xml-parse"))) :depends-on (:cxml-runes :puri :trivial-gray-streams)) diff --git a/doc/dom.html b/doc/dom.html index 2f06f66..1176270 100644 --- a/doc/dom.html +++ b/doc/dom.html @@ -23,7 +23,7 @@
  • diff --git a/doc/installation.html b/doc/installation.html index c10ee97..70a1b7c 100644 --- a/doc/installation.html +++ b/doc/installation.html @@ -23,7 +23,7 @@
  • @@ -116,7 +116,7 @@

    Compilation

    * (asdf:operate 'asdf:load-op :cxml)

    - You can then try the quick-start example. + You can then try the quick-start example.

    diff --git a/doc/quickstart.html b/doc/quickstart.html index 2afacd1..c69c3e1 100644 --- a/doc/quickstart.html +++ b/doc/quickstart.html @@ -23,7 +23,7 @@
  • diff --git a/doc/using.html b/doc/using.html index ecbd003..622f571 100644 --- a/doc/using.html +++ b/doc/using.html @@ -23,7 +23,7 @@
  • @@ -388,6 +388,32 @@

    Miscellaneous SAX handlers

    start-element events before passing them on the next handler.

    +

    +

    Function CXML:MAKE-WHITESPACE-NORMALIZER (chained-handler &optional dtd)
    + Return a SAX handler which removes whitespace from elements that + have element content and have not been declared to + preserve space using an xml:space attribute. +

    +

    Example:

    +
    (cxml:parse-file "example.xml"
    +                 (cxml:make-whitespace-normalizer (cxml-dom:make-dom-builder))
    +                 :validate t)
    +

    Example input:

    +
    <!DOCTYPE test [
    +<!ELEMENT test (foo,bar*)>
    +<!ATTLIST test a CDATA #IMPLIED>
    +<!ELEMENT foo #PCDATA>
    +<!ELEMENT bar (foo?)>
    +<!ATTLIST bar xml:space (default|preserve) "default">
    +]>
    +<test a='b'>
    +  <foo>   </foo>
    +  <bar>   </bar>
    +  <bar xml:space="preserve">   </bar>
    +</test>
    +
    +

    Example result:

    +
    <test a="b"><foo>   </foo><bar></bar><bar xml:space="preserve">   </bar></test>

    Recoders

    @@ -572,6 +598,9 @@

    SAX Interface

    Accessor SAX:ATTRIBUTE-QNAME (attribute)
    Accessor SAX:ATTRIBUTE-SPECIFIED-P (attribute)
    Accessor SAX:ATTRIBUTE-VALUE (attribute)
    +
    +
    Function SAX:FIND-ATTRIBUTE (qname attributes)
    +
    Function SAX:FIND-ATTRIBUTE-NS (uri lname attributes)

    The entity declaration methods are similar to Java SAX diff --git a/doc/xmls-compat.html b/doc/xmls-compat.html index 4474b39..f959a8d 100644 --- a/doc/xmls-compat.html +++ b/doc/xmls-compat.html @@ -23,7 +23,7 @@

  • diff --git a/runes/ystream.lisp b/runes/ystream.lisp index e574f3a..13f1af6 100644 --- a/runes/ystream.lisp +++ b/runes/ystream.lisp @@ -223,6 +223,7 @@ #+rune-is-integer (progn (defstruct (character-stream-ystream/utf8 + (:constructor make-character-stream-ystream/utf8 (os-stream)) (:include %stream-ystream) (:conc-name "YSTREAM-"))) diff --git a/xml/package.lisp b/xml/package.lisp index 8fc1255..c3b358c 100644 --- a/xml/package.lisp +++ b/xml/package.lisp @@ -77,5 +77,6 @@ #:sax-proxy #:proxy-chained-handler #:make-namespace-normalizer + #:make-whitespace-normalizer #:rod-to-utf8-string #:utf8-string-to-rod)) diff --git a/xml/recoder.lisp b/xml/recoder.lisp index 0816377..b40a2c1 100644 --- a/xml/recoder.lisp +++ b/xml/recoder.lisp @@ -118,4 +118,8 @@ (defwrapper sax:entity-resolver (resolver) - resolver)) + resolver) + + (defwrapper sax::dtd + (dtd) + dtd)) diff --git a/xml/sax-handler.lisp b/xml/sax-handler.lisp index faec86c..364ab01 100644 --- a/xml/sax-handler.lisp +++ b/xml/sax-handler.lisp @@ -53,6 +53,8 @@ #:*use-xmlns-namespace* #:make-attribute + #:find-attribute + #:find-attribute-ns #:attribute-namespace-uri #:attribute-local-name #:attribute-qname @@ -137,6 +139,23 @@ Setting this variable has no effect unless both value specified-p) +(defun %rod= (x y) + ;; allow rods *and* strings *and* null + (cond + ((zerop (length x)) (zerop (length y))) + ((zerop (length y)) nil) + ((stringp x) (string= x y)) + (t (runes:rod= x y)))) + +(defun find-attribute (qname attrs) + (find qname attrs :key #'attribute-qname :test #'%rod=)) + +(defun find-attribute-ns (uri lname attrs) + (find-if (lambda (attr) + (and (%rod= uri (sax:attribute-namespace-uri attr)) + (%rod= lname (sax:attribute-local-name attr)))) + attrs)) + (defgeneric start-document (handler) (:documentation "Called at the beginning of the parsing process, before any element, processing instruction or comment is reported. @@ -325,7 +344,11 @@ finished, if present.") (:documentation "Called between sax:end-dtd and sax:end-document to register an entity resolver, a function of two arguments: An entity name and SAX handler. - When called, the resolver function will parse the named entities data.") + When called, the resolver function will parse the named entity's data.") (:method ((handler t) resolver) (declare (ignore resolver)) nil)) + +;; internal for now +(defgeneric dtd (handler dtd) + (:method ((handler t) dtd) (declare (ignore dtd)) nil)) diff --git a/xml/sax-proxy.lisp b/xml/sax-proxy.lisp index a6fa3da..4db10f0 100644 --- a/xml/sax-proxy.lisp +++ b/xml/sax-proxy.lisp @@ -37,4 +37,5 @@ (define-proxy-method sax:notation-declaration (name public-id system-id)) (define-proxy-method sax:element-declaration (name model)) (define-proxy-method sax:attribute-declaration (elt attr type default)) - (define-proxy-method sax:entity-resolver (resolver))) + (define-proxy-method sax:entity-resolver (resolver)) + (define-proxy-method sax::dtd (dtd))) diff --git a/xml/space-normalizer.lisp b/xml/space-normalizer.lisp new file mode 100644 index 0000000..d4c9c14 --- /dev/null +++ b/xml/space-normalizer.lisp @@ -0,0 +1,62 @@ +;;;; space-normalizer.lisp -- whitespace removal +;;;; +;;;; This file is part of the CXML parser, released under Lisp-LGPL. +;;;; See file COPYING for details. +;;;; +;;;; Copyright (c) 2005 David Lichteblau + +(in-package :cxml) + +(defclass whitespace-normalizer (sax-proxy) + ((attributes :initform '(t) :accessor xml-space-attributes) + (models :initform nil :accessor xml-space-models) + (dtd :initarg :dtd :accessor xml-space-dtd))) + +(defun make-whitespace-normalizer (chained-handler &optional dtd) + (make-instance 'whitespace-normalizer + :dtd dtd + :chained-handler chained-handler)) + +(defmethod sax::dtd ((handler whitespace-normalizer) dtd) + (unless (xml-space-dtd handler) + (setf (xml-space-dtd handler) dtd))) + +(defmethod sax:start-element + ((handler whitespace-normalizer) uri lname qname attrs) + (declare (ignore uri lname)) + (let ((dtd (xml-space-dtd handler))) + (when dtd + (let ((xml-space + (sax:find-attribute (if (stringp qname) "xml:space" #"xml:space") + attrs))) + (push (print(if xml-space + (rod= (rod (sax:attribute-value xml-space)) #"default") + (car (xml-space-attributes handler)))) + (xml-space-attributes handler))) + (let* ((e (cxml::find-element (rod qname) dtd)) + (cspec (when e (cxml::elmdef-content e)))) + (push (and (consp cspec) + (not (and (eq (car cspec) '*) + (let ((subspec (second cspec))) + (and (eq (car subspec) 'or) + (eq (cadr subspec) :PCDATA)))))) + (xml-space-models handler))))) + (call-next-method)) + +(defmethod sax:characters ((handler whitespace-normalizer) data) + (cond + ((and (xml-space-dtd handler) + (car (xml-space-attributes handler)) + (car (xml-space-models handler))) + (unless (every #'white-space-rune-p (rod data)) + (warn "non-whitespace character data in element content") + (call-next-method))) + (t + (call-next-method)))) + +(defmethod sax:end-element ((handler whitespace-normalizer) uri lname qname) + (declare (ignore uri lname qname)) + (when (xml-space-dtd handler) + (pop (xml-space-attributes handler)) + (pop (xml-space-models handler))) + (call-next-method)) diff --git a/xml/xml-parse.lisp b/xml/xml-parse.lisp index 6d60367..0582ebc 100644 --- a/xml/xml-parse.lisp +++ b/xml/xml-parse.lisp @@ -2513,7 +2513,8 @@ (let ((dtd (dtd *ctx*))) (sax:entity-resolver (handler *ctx*) - (lambda (name handler) (resolve-entity name handler dtd)))) + (lambda (name handler) (resolve-entity name handler dtd))) + (sax::dtd (handler *ctx*) dtd)) (list :DOCTYPE name extid)))) (defun report-cached-dtd (dtd)