#!/usr/bin/env ruby
# = sisu - SiSU Simple information Structuring Universe
#
# Copyright (c) Ralph Amissah 1997,2004
#
# Ralph Amissah mailto:ralph@amissah.com
#
# * Name: SiSU - Simple information Structuring Universe   
# * Author: Ralph@Amissah.com
# * Description: document conversion tool, to sisu from other formats
#   * arch-tag: document conversion tool to sisu markup
#   * $Date: 2004/10/16 15:51:06 $
#   * $Id: sisu_convert,v 1.37 2004/10/16 15:51:06 ralph Exp $
# *  License: GPL 2 or later
# * Notes: word conversion uses wvWare and wvSiSU.xml (a modified/stripped wvHtml.xml) 
#   * http://wvware.sourceforge.net/
#   * http://sourceforge.net/projects/wvware
# * <url:sisu.lnk>|sisu.lnk|@|^| 
# * <url:sisu>
# * <url:zxy_param.rb>|zxy_param.rb|@|^| 
module CONVERT
  # Writes converted paragraphs to the working file ",,<filename>.er9"
  # (relative to the current directory), preceded by a skeleton SiSU header.
  class MyOutput
    # Header fields shared by both header variants, in output order.
    # Trailing spaces are part of the emitted text.
    COMMON_HEADER_FIELDS = [
      '0~title ',
      '0~subtitle ',
      '0~creator ',
      '0~type ',
      '0~subject',
      '0~date',
      '0~date.available',
      '0~publisher SiSU'
    ].freeze

    # data::     Array (or any enumerable, e.g. an Enumerator) of text chunks;
    #            nil entries are discarded.
    # filename:: base name used to build the output file name.
    # instruct:: command-line switch; a value matching /default/ selects
    #            headerDefault, anything else headerBasic.
    def initialize(data, filename, instruct)
      # Array() turns an Enumerator into a real Array first: Enumerator#compact
      # only exists from Ruby 3.1 on, while callers pass block-less #collect results.
      @data = Array(data).compact
      @filename = filename
      @instruct = instruct
    end

    # Header used for every switch except the "default" one.
    # Returns a String in which each field is followed by a blank line.
    def headerBasic
      header_text(COMMON_HEADER_FIELDS + ['0~rights ', '0~level'])
    end

    # Header used when the switch matches /default/.
    # Returns a String in which each field is followed by a blank line.
    def headerDefault
      header_text(COMMON_HEADER_FIELDS + ['0~rights ...'])
    end

    # Writes the header followed by every non-empty line of the data, each
    # stripped and followed by a blank line. Returns the data array.
    def hardOutput
      header = case @instruct
               when /default/ then headerDefault
               else headerBasic
               end
      # The block form closes (and flushes) the file even if writing raises.
      File.open(",,#{@filename}.er9", 'w') do |out|
        # Write the header as a String; appending an Array would emit its
        # inspect form (brackets, quotes, escaped newlines) on Ruby >= 1.9.
        out << header
        @data.each do |chunk|
          chunk.split("\n").each do |line| # one output paragraph per source line
            text = line.strip
            out.puts "#{text}\n\n" unless text.empty?
          end
        end
      end
      @data
    end

    private

    # Joins header fields, terminating each with a blank line.
    def header_text(fields)
      fields.map { |field| "#{field}\n\n" }.join
    end
  end
  # Turns wvWare output (paragraphs still carrying <b>/<i>/<u> tags) into
  # initial SiSU markup and writes it out through MyOutput.
  class WareWord97
    # Empty tag pairs and immediately re-opened tags, removed in this order.
    REDUNDANT_TAGS = [
      /<u>\s*<\/u>/,
      /<\/u>\s*<u>/,
      /<b>\s*<\/b>/,
      /<\/b>\s*<b>/,
      /<i>\s*<\/i>/,
      /<\/i>\s*<i>/
    ].freeze

    # data::     Array (or Enumerator) of paragraph strings; the strings are
    #            modified in place by #strip and #markup_rules.
    # filename:: base name of the document (without extension).
    # instruct:: command-line switch, passed on to MyOutput.
    def initialize(data, filename, instruct)
      @data = data
      @filename = filename
      @instruct = instruct
    end

    # Runs the whole conversion: two #strip passes, then #markup_rules, then
    # MyOutput#hardOutput. Returns whatever hardOutput returns.
    def songsheet
      print "Convert to SiSU file from Word97 << gvim ,,#{@filename}.er9 >\n" #: <<#{@@html_title}>>
      # Work on a real Array throughout. The previous block-less #collect calls
      # produced Enumerators on Ruby >= 1.8.7, which MyOutput could not #compact
      # before Ruby 3.1.
      paras = Array(@data)
      # Two passes: removing an inner empty pair can expose an outer one
      # (e.g. "<u><b> </b></u>"), which only the second pass removes.
      2.times { paras = WareWord97.new(paras, @filename, @instruct).strip }
      paras = WareWord97.new(paras, @filename, @instruct).markup_rules
      MyOutput.new(paras, @filename, @instruct).hardOutput
    end

    # Trims each paragraph and deletes redundant tag pairs (see REDUNDANT_TAGS).
    # Mutates the paragraph strings and returns them in a new Array.
    def strip
      @data.map do |para|
        para.strip!
        REDUNDANT_TAGS.each { |pattern| para.gsub!(pattern, '') }
        para
      end
    end

    # Rewrites bold headings as SiSU heading levels and remaining inline tags
    # as SiSU emphasis markers. Mutates the paragraph strings and returns them
    # in a new Array.
    def markup_rules
      @data.map do |para|
        para.strip!
        para.gsub!(/\s+/, ' ')
        # Bold paragraph openers become headings; the more specific numeric
        # patterns must run before the less specific ones.
        para.gsub!(/^<b>(Chapter|Article)(.+?)<\/b>/i, "4{ \\1 \\2") #watch case insensitivity
        para.gsub!(/^<b>(Part|Section|Book)(.+?)<\/b>/i, "3{ \\1 \\2") #watch case insensitivity
        para.gsub!(/^<b>(\d+\.\d+\.\d+)(.+?)<\/b>/i, "6{ \\1 \\2") #numeric, decide what to do, can be different
        para.gsub!(/^<b>(\d+\.\d+)(.+?)<\/b>/i, "5{ \\1 \\2") #numeric, decide what to do, can be different
        para.gsub!(/^<b>(\d+)(.+?)<\/b>/i, "4{ \\1 \\2") #numeric, decide what to do, can be different
        # Whatever inline emphasis is left.
        para.gsub!(/<u>(.+?)<\/u>/, "_{\\1}_")
        para.gsub!(/<b>(.+?)<\/b>/, "*{\\1}*")
        para.gsub!(/<i>(.+?)<\/i>/, "/{\\1}/")
        para
      end
    end
  end
  # Converts HTML source into initial SiSU markup in three passes
  # (#space_paragraphs, #multiline, #markup_rules) and writes the result out
  # through MyOutput. "zZz" is used internally as a paragraph-break marker.
  class Html
    # data::     Array (or String) of HTML text; strings are modified in place.
    # filename:: base name of the document (without extension).
    # instruct:: command-line switch, passed on to MyOutput.
    def initialize(data, filename, instruct)
      @data = data
      @filename = filename
      @instruct = instruct
    end

    # Runs the whole conversion and returns whatever MyOutput#hardOutput returns.
    def songsheet
      print "Convert to SiSU file from #{@filename}.html << gvim ,,#{@filename}.er9 >\n" #: <<#{@@html_title}>>
      # The input is processed line by line. IO.readlines hands over an Array,
      # which has no #split, so the chunks are joined before splitting.
      lines = Array(@data).join.split("\n")
      spaced = Html.new(lines, @filename, @instruct).space_paragraphs
      joined = Html.new(spaced, @filename, @instruct).multiline
      # #multiline returns an Array; join it directly (an Enumerator from a
      # block-less #collect has no #join).
      paras = Html.new(joined.join.split("\n\n"), @filename, @instruct).markup_rules
      MyOutput.new(paras, @filename, @instruct).hardOutput
    end

    # Replaces paragraph-level tags with the "zZz" break marker and squeezes
    # whitespace. Mutates the strings and returns them in a new Array.
    def space_paragraphs
      @data.map do |para|
        para.strip!
        para.gsub!(/\r/, '')
        # NOTE: replacing "\n" here was found to be very slow on some files,
        # so newlines are left for #multiline.
        para.gsub!(/<\/?p>/i, 'zZz')
        para.gsub!(/<\/?\s*p(?:\s+ALIGN=.+?)?>/i, "zZz") #all manner of <p>
        para.gsub!(/<p\s+(class|align).+?>/i, "zZz") #
        para.gsub!(/<\/p>/i, "zZz") # repeat actually
        para.gsub!(/<(?:dir|tr|br)>/i, "zZz") #
        # A break is added after closing </center> and heading tags; the tag is kept.
        para.gsub!(/(<\/center>)/i, "\\1zZz")
        para.gsub!(/(<\/h[1-6]>)/i, "\\1zZz")
        para.gsub!(/ \s+/i, ' ')
        para.gsub!(/(?:\s*zZz\s*)+/i, "zZz") # collapse runs of markers into one
        para
      end
    end

    # Marks paragraphs inside a blockquote with "_1 " (currently not called by
    # any pass). +sub+ is the text split on "<blockquote>"; returns one String.
    def blockquotes(sub='')
      Array(sub).map do |x|
        if x =~ /(<\/blockquote>)/i
          m = $1
          quoted = x[/(.+?)#{m}/mi, 1]
          rest = x[/#{m}(.+)/mi, 1]
          # Non-destructive gsub: gsub! returns nil when nothing is replaced,
          # which used to make blockquotes without a "zZz" marker vanish.
          quoted = quoted.gsub(/zZz/, "zZz_1 ") if quoted
          "#{quoted}#{rest}"
        else
          x
        end
      end.join
    end

    # Pulls break markers out of inline/heading tags so that a tag does not
    # span a paragraph break, then turns each marker run into a blank line.
    # Returns an Array in which every (mutated) string is followed by a
    # separate "\n\n" element.
    def multiline
      tuned_file = []
      @data.each do |para|
        para.gsub!(/\n/, ' ')
        para.gsub!(/ \s+/mi, ' ')
        # The patterns below can be expensive on large inputs.
        para.gsub!(/<([biu]|h[1-6])>(?:zZz)?([^<]+)?zZz(.+?)<\/\1>/i, "zZz<\\1>\\2 \\3</\\1>")
        para.gsub!(/<([biu]|h[1-6])>(?:<center>|zZz)+(.+?)(?:<\/center>)?zZz(.+?)?<\/\1>/i, "zZz<\\1>\\2 \\3</\\1>")
        para.gsub!(/<([biu]|h[1-6])>(?:<center>|zZz)+(.+?)<\/\1>/i, "zZz<\\1>\\2</\\1>")
        para.gsub!(/<(h[1-6])>(.+?)(?:<center>|zZz)+<\/\1>/i, "zZz<\\1>\\2</\\1>zZz") #does catch some h1, h2 etc, too expensive to have biu
        # Blockquote handling via #blockquotes is disabled.
        para.gsub!(/zZz(\s*zZz)*/, "\n\n")
        tuned_file << para << "\n\n"
      end
      tuned_file
    end

    # Rewrites links, endnote anchors, headings and inline emphasis as SiSU
    # markup and strips leftover HTML. Rule order matters. Mutates the strings
    # and returns them in a new Array.
    def markup_rules
      @data.map do |para|
        if para=~/<a href="(http:\/\/.+?)">/i
          # A link whose text is itself a URL is reduced to its href.
          para.gsub!(/(?:&lt;\s*)?<a href="(http:\/\/.+?)">http:\/\/.+?<\/a>(?:\s*&gt;)?\.?/i, "\\1") #risk that url & url are not to match
        end
        ### clean
        para.gsub!(/^\s+/i, '')
        para.gsub!(/<([bui]|em|su[pb])>\s*<\/\1>/i, '')
        para.gsub!(/<\/?center>/i, '')
        para.gsub!(/\s*<\/dir>/i, '')
        para.gsub!(/<hr>/i, '')
        ### endnotes
        para.gsub!(/\s*<a href=".+?\.html#(?:[a-z_]+)?(?:[a-z0-9_-]|\*)+">\[(\*+)\]<\/a>/i, "^{[\\1]}^ ") #other endnote marker
        para.gsub!(/<a href=".+?\.html#(?:[a-z_$]+)?[0-9_-]+"(?:\s+name=".+?")?>\[[a-z]?\d+\](?:<\/[bi]>)?<\/a>/i, '~e ') #endnote marker
        para.gsub!(/<a name=".+?"\s+href=".+?\.html#(?:[a-z_$]+)?[0-9_-]+"?>\[[a-z]?\d+\](?:<\/[bi]>)?<\/a>/i, '~e ') #endnote marker
        para.gsub!(/<a name="(?:[a-z$]+)?[0-9_-]+">\s*(<\/a>)?\s*\d+\.?\s*(<\/a>)?\s*/i, '~{{ ') #endnote
        ### headings
        para.gsub!(/<h([1-6])(?: align=.+?)?>\s*(.+?)\s*<\/h\1>\s*/i, "\\1{ \\2") #
        para.gsub!(/^<b>(Chapter|Article)(.+?)<\/b>/i, "4{ \\1 \\2") #watch case insensitivity
        para.gsub!(/^<b>(Part|Section|Book)(.+?)<\/b>/i, "3{ \\1 \\2") #watch case insensitivity
        para.gsub!(/^<b>(\d+\.\d+\.\d+)(.+?)<\/b>/i, "6{ \\1 \\2") #numeric, decide what to do, can be different
        para.gsub!(/^<b>(\d+\.\d+)(.+?)<\/b>/i, "5{ \\1 \\2") #numeric, decide what to do, can be different
        para.gsub!(/^<b>(\d+)(.+?)<\/b>/i, "4{ \\1 \\2") #numeric, decide what to do, can be different
        # e.g. <a name="ii"></a><B>
        para.gsub!(/^(<a name=".+?">)(?:<small>)?<(?:b|strong)>\s*(.+?)\s*<\/(?:b|strong)>/i, "5{ \\2 \\1") #watch
        # NOTE(review): \2 here is the literal attribute name ("a name"), not
        # the heading text - looks unintended, confirm before changing.
        para.gsub!(/^(<(a name|A NAME)=".+?">)(\s*|<\/[aA]>)?([A-Z][A-Z])+/, "5{ \\2 \\1") #watch
        para.gsub!(/^(\s+|<p>)?(<a name=".+?">)(\s*|<\/a>)?<b>/i, "5{ \\2 \\1") #watch
        para.gsub!(/<h([1-6])>\s*(.+?)\s*<\/h\1>\s*/i, "\\1{ \\2") #
        para.gsub!(/^<b>\s*(.+?)<\/b>\s*(<\/i>\s*)?$/i, "4{ \\1\\2") # wish it all were less messy
        para.gsub!(/^<i>\s*([^"(].+?)<\/i>\s*(<\/b>\s*)?$/i, "5{ \\1\\2") # wish it all were less messy
        # Headings carry no inline emphasis.
        para.gsub!(/<\/?[biu]>/i, '') if para =~/[1-6]\{/
        ### inline emphasis
        para.gsub!(/<u>\s*(.+?)\s*<\/u>/i, "_{\\1}_")
        para.gsub!(/<(b|strong)>\s*(.+?)\s*<\/\1>/i, "*{\\2}*")
        para.gsub!(/<(i|em)>\s*(.+?)\s*<\/\1>/i, "/{\\2}/")
        para.gsub!(/<sup>\s*(.+?)\s*<\/sup>/i, "^{\\1}^")
        ### spacing around emphasis markers
        para.gsub!(/(([\/\*!_])\{.+?\}\2)\s\s+/i, "\\1 ")
        para.gsub!(/(([\/\*!_])\{.+?\}\2)\s+([.,;?\)])\s+/i, "\\1\\3 ")
        para.gsub!(/(([\/\*!_])\{.+?\}\2)(["'])\s+/i, "\\1\\3 ")
        para.gsub!(/(([\/\*!_])\{.+?\}\2)\s*([a-z0-9])/i, "\\1 \\3")
        para.gsub!(/(([\/\*_])\{.+?\}\2)\s*([a-z0-9])/i, "\\1 \\3")
        para.gsub!(/([a-z0-9])(([\/\*_])\{.+?\}\3)/i, " \\1 \\2") #eg this/{problem}/
        para.gsub!(/([\/\*_])\{([,.;; ]+)\}\1/i, "\\2") #eg /{,}/ or *{ }* etc.
        para.gsub!(/ \s+/i, ' ')
        ### entities and document-level tags
        para.gsub!(/&quot;/i, '"')
        para.gsub!(/&amp;/i, 'and')
        para.gsub!(/<!doctype html public .+/i, '')
        para.gsub!(/<\/?(?:html|head|body|font|small)>/i, '')
        para.gsub!(/<\/(?:title)>/i, '')
        # Single-quoted on purpose: the replacement is the literal text, not an interpolation.
        para.gsub!(/<title>/i, '#{~title? ')
        para.gsub!(/<blockquote>(.+?)<\/blockquote>/mi, "\n\n_1 \\1\n\n")
        para.gsub!(/<div align=.+?>|<\/div>|<font size=.+?>|<\/a><\/em><\/strong>/i, '')
        para.gsub!(/~e\s+\.\s*/i, ".~e ") #check vim equiv # %s/\~e\s\+\.\s*/.\~e /c
        para.gsub!(/\s+~e\s+/i, "~e ")
        para.gsub!(/ \s+/i, ' ')
        para.gsub!(/\s+$/i, '')
        ### leftovers
        para.gsub!(/^(?:<\/[bi]>)+$/i, '')
        para.gsub!(/^(?:(?:<i>)+<b>|(?:<b>)+<i>)\s*([^"(].+?)/i, "5{ \\1\\2") # wish it all were less messy
        para.gsub!(/^(?:<\/?(?:[ib]|em)>\s*)+$/i, '') # cleaning up left over <i> etc.
        para.gsub!(/<(?:i|em)>\s*(.+)/i, "/{\\1}/") # using up left over <i>
        para.gsub!(/<b>\s*(.+)/i, "*{\\1}*") # using up left over <b>
        para
      end
    end
  end
  # Variant of Html for the "default" html layout: runs the Html passes on the
  # input chunks as given (no per-line split) and then #markup_default.
  class Default < Html
    # data::     Array (or String) of HTML text; strings are modified in place.
    # filename:: base name of the document (without extension).
    # instruct:: command-line switch, passed on to MyOutput.
    def initialize(data, filename, instruct)
      @data = data
      @filename = filename
      @instruct = instruct
    end

    # Runs the whole conversion and returns whatever MyOutput#hardOutput returns.
    def songsheet
      print "Convert to SiSU file from #{@filename}.html << gvim ,,#{@filename}.er9 >\n" #: <<#{@@html_title}>>
      # Real Arrays are passed between the passes: the block-less #collect used
      # before yields an Enumerator on Ruby >= 1.8.7, and Enumerator has no #join.
      spaced = Default.new(Array(@data), @filename, @instruct).space_paragraphs
      joined = Default.new(spaced, @filename, @instruct).multiline
      paras = Default.new(joined.join.split("\n\n"), @filename, @instruct).markup_rules
      paras = Default.new(paras, @filename, @instruct).markup_default
      MyOutput.new(paras, @filename, @instruct).hardOutput
    end

    # Extra clean-up after Html#markup_rules: italic "Id." references and list
    # tags trailing an endnote. Mutates the strings and returns them in a new Array.
    def markup_default
      @data.map do |para|
        # NOTE(review): the closing "/" lands after the captured whitespace
        # ("/{Id.} /") - confirm this is the intended output.
        para.gsub!(/<i>(Id\.?)(\s|$)/i, "/\{\\1\}\\2/")
        para.gsub!(/^(~\{\{ .+?)(<\/LI>\s*|<\/OL>\s*)+$/i, "\\1")
        para.gsub!(/\/\{Id\.\s*<\/LI>\s*\}\//i, '/{Id.}/')
        para
      end
    end
  end
end
# Prints the usage summary to standard output. Shown when the first
# command-line argument is missing or is not a recognised switch.
def help
  puts <<WOK
conversion program
initial SiSU markup from other file formats

  zxy_convert --word    does initial conversion from word97 to sisu markup, expects [filename].doc (can also use --word97 or --doc)
  zxy_convert --html    does initial conversion from html to sisu markup, expects [filename].html
  zxy_convert --default does initial conversion from default html to sisu markup, expects [filename].html

WOK
end
# Converts each *.doc argument: runs wvWare with the wvSiSU.xml template to
# produce <name>.wv, then feeds that file to CONVERT::WareWord97.
#
# argv::     file names from the command line; names not ending in ".doc" are
#            reported on stdout and ignored.
# instruct:: the command-line switch, passed through to the converter.
#
# Relies on the top-level @argv (accumulated base names) and @dir (provides
# #home) set up by the script. Returns @argv.
def doWord(argv, instruct)
  require 'shellwords' # stdlib; only this conversion path shells out

  argv.each do |f|
    if f =~/.+?\.doc$/
      @argv << f[/(.+?)\.doc$/, 1]
    else
      print "not .doc? << #{f} >> "
    end
  end
  template = "#{@dir.home}/.sisu/convert/wvSiSU.xml"
  @argv.each do |filename|
    # Escape every path: an unquoted name containing spaces or shell
    # metacharacters would otherwise break the command and its redirect.
    command = [
      'wvWare', '-x', Shellwords.escape(template),
      Shellwords.escape("#{filename}.doc"),
      '>', Shellwords.escape("#{filename}.wv")
    ].join(' ')
    unless system(command)
      # Do not build a header-only document from a failed conversion.
      $stderr.puts "wvWare failed for #{filename}.doc, skipping"
      next
    end
    file_array = IO.readlines("#{filename}.wv", "") # "" = paragraph mode
    CONVERT::WareWord97.new(file_array, filename, instruct).songsheet # metaverse created here
  end
end
# Converts each *.html argument with CONVERT::Html.
#
# argv::     file names from the command line; names not ending in ".html"
#            are reported on stdout and ignored.
# instruct:: the command-line switch, passed through to the converter.
#
# Base names are accumulated in the top-level @argv, which is also returned.
def doHtml(argv, instruct)
  argv.each do |candidate|
    stem = candidate[/(.+?)\.html$/, 1]
    if stem.nil?
      print "not .html? << #{candidate} >> "
      next
    end
    @argv << stem
  end
  @argv.each do |stem|
    # The file is read in chunks separated by "\n\r".
    chunks = IO.readlines("#{stem}.html", "\n\r")
    CONVERT::Html.new(chunks, stem, instruct).songsheet # metaverse created here
  end
end
# Converts each *.html argument with CONVERT::Default.
#
# argv::     file names from the command line; names not ending in ".html"
#            are reported on stdout and ignored.
# instruct:: the command-line switch, passed through to the converter.
#
# Base names are accumulated in the top-level @argv, which is also returned.
def doDefault(argv, instruct)
  argv.each do |candidate|
    stem = candidate[/(.+?)\.html$/, 1]
    if stem.nil?
      print "not .html? << #{candidate} >> "
      next
    end
    @argv << stem
  end
  @argv.each do |stem|
    # The file is read in chunks separated by "\n\r".
    chunks = IO.readlines("#{stem}.html", "\n\r")
    CONVERT::Default.new(chunks, stem, instruct).songsheet # metaverse created here
  end
end
# Dispatches on the command-line switch: --word / --word97 / --doc go to
# doWord, --html to doHtml, --default to doDefault (all case-insensitive);
# anything else prints the usage text via help.
#
# argv::     remaining command-line arguments (file names).
# instruct:: the switch to dispatch on.
#
# Returns the result of the routine that was called.
def cases(argv, instruct)
  routes = [
    [/^--(word(97)?|doc)$/i, :doWord],
    [/^--(html)$/i, :doHtml],
    [/^--(default)$/i, :doDefault]
  ]
  _pattern, routine = routes.find { |pattern, _name| pattern === instruct }
  return help unless routine

  send(routine, argv, instruct)
end
require 'zxy_sysenv.rb'
# Script entry point: set up the shared state used by the do* routines, take
# the first command-line argument as the switch and dispatch on it.
include SiSU_Env
@dir = SiSU_Env::Info_dir.new # provides #home for locating the wvWare template
@argv = []                    # base names collected by doWord/doHtml/doDefault
argv = ARGV
# The first argument is the switch; the remaining ones are file names.
instruct = argv.shift.to_s.chomp
instruct = "help" if instruct.empty?
cases(argv, instruct)
