# Samizdat HTML validation
#
#   Copyright (c) 2002-2005  Dmitry Borodaenko <angdraug@debian.org>
#
#   This program is free software.
#   You can distribute/modify this program under the terms of
#   the GNU General Public License version 2 or later.
#
# vim: et sw=2 sts=2 ts=8 tw=0

require 'cgi'
require 'yaml'
require 'rexml/document'

require 'tidy'

# use (") instead of (') in XML attributes, escape both of them
#
module REXML

class Attribute
  # Serialize the attribute as name="value": the value is always wrapped in
  # double quotes, and any double or single quote left in the string
  # returned by to_s is replaced with its entity.
  #
  def to_string
    escaped = to_s.gsub('"', '&quot;').gsub("'", '&apos;')
    "#{@expanded_name}=\"#{escaped}\""
  end
end

end   # module REXML


module Samizdat

class Sanitize
  # default location of the Tidy shared library
  TIDY_PATH = '/usr/lib/libtidy.so'

  # remember the default filter and point Tidy at its shared library
  #
  # xhtml:: filter hash used when no explicit filter is passed (see
  #         sanitize_element for the keys that are read from it)
  # tidypath:: value assigned to Tidy.path
  #
  def initialize(xhtml, tidypath=TIDY_PATH)
    @xhtml = xhtml

    # workaround for memory leak in Tidy.path=: only assign it when the
    # requested path differs from the one assigned last time
    if !defined?(@@tidypath) || tidypath != @@tidypath
      Tidy.path = @@tidypath = tidypath
    end
  end

  attr_reader :xhtml

  # one CSS declaration: a property name (captured), a colon, and one or
  # more value tokens made of plain characters, "#" followed by hex digits,
  # or digits followed by "%", optionally separated by whitespace
  #
  # The value is matched one character unit at a time ("#" plus a hex digit,
  # a digit plus "%", or a single plain character) so that no quantified
  # group is nested inside another: rejected input cannot trigger
  # exponential backtracking, while the set of accepted strings is the same
  # as with run-based tokens.
  #
  CSS = %r{
    \A\s*
    ([-a-z0-9]+) : \s*
    (?: (?: [0-9]% | \#[0-9a-f] | [-./a-z0-9] ) \s* ) +
    \z
  }xi.freeze

  # check every ";"-separated declaration in a style="" value
  #
  # css:: collection of allowed property names (anything with include?)
  # style:: text of the style attribute
  #
  # Returns true when each declaration matches CSS and names a property
  # listed in css, false otherwise.
  #
  def check_style(css, style)
    style.split(';').all? do |declaration|
      match = CSS.match(declaration)
      match && css.include?(match[1])
    end
  end

  # compare elements and attributes with xhtml.yaml
  #
  # xml:: REXML element to clean in place
  # filter:: hash keyed by element name; an element whose name is not a key
  #          (or starts with "_") is removed with all of its content.
  #          filter['_common'] and filter[element name] map attribute names
  #          to patterns tested with === against the attribute text;
  #          filter['_css'], when set, lists CSS properties allowed inside
  #          style attributes.
  #
  def sanitize_element(xml, filter=@xhtml)
    if xml.name =~ /\A_/ || !filter.key?(xml.name)
      # detach the node itself instead of looking it up again by XPath
      xml.remove
      return
    end

    if xml.has_attributes?
      allowed = (filter['_common'] || {}).merge(filter[xml.name] || {})
      css = filter['_css']

      # collect first, delete afterwards: don't modify the attribute set
      # while it is being iterated
      rejected = []
      xml.attributes.each_attribute do |a|
        keep = allowed[a.name] === a.to_s
        if keep && 'style' == a.name && css
          # sanitize CSS in style="" attributes
          keep = check_style(css, a.value)
        end
        rejected << a.name unless keep
      end
      rejected.each {|name| xml.delete_attribute(name) }
    end

    if xml.has_elements?   # recurse
      # pass the filter down: nested elements must be checked against the
      # same filter as their parent, not against the default one
      xml.elements.to_a.each {|e| sanitize_element(e, filter) }
    end
  end

  # filter HTML through Tidy
  #
  # Returns the XHTML text produced by Tidy for html.to_s.
  #
  def tidy(html)
    source = html.to_s
    # taint tracking is gone from current Ruby: only call it where it exists
    source = source.untaint if source.respond_to?(:untaint)

    xml = Tidy.open(:output_xhtml => true, :literal_attributes => true,
      :tidy_mark => false, :wrap => 0, :char_encoding => 'utf8'
    ) {|tidy| tidy.clean(source) }

    xml.respond_to?(:taint) ? xml.taint : xml
  end

  # return sanitized HTML
  #
  # html:: markup to clean
  # fragment:: when true, only the content of <body> is returned; otherwise
  #            the whole document element is returned
  # filter:: filter hash, see sanitize_element
  #
  # Raises RuntimeError when the Tidy output cannot be parsed as XML.
  #
  def sanitize(html, fragment=true, filter=@xhtml)
    begin
      xml = REXML::Document.new(tidy(html)).root
      xml = xml.elements['//html/body'] if fragment   # work around tidy
    rescue REXML::ParseException => e
      # report only the first line of the underlying error; fall back to the
      # parse exception itself when it doesn't wrap another one, and use the
      # non-destructive gsub (gsub! returns nil when nothing is replaced)
      reason = (e.continued_exception || e).to_s.gsub(/\n.*/, '')
      raise RuntimeError, "Invalid HTML detected: " + CGI.escapeHTML(reason)
    end
    sanitize_element(xml, filter)
    html = ''
    if fragment
      xml.each {|child| child.write(html, -1, false, true) }
    else
      xml.write(html, -1, false, true)
    end
    html
  end
end

end   # module Samizdat
