#####################################################
# note those HTML tags...
# removeAccentsEnglishLongWebster --
#
#   Reduces accented letters in a chunk of text to plain ASCII letters.
#   Both literal accented characters and the HTML-style entities
#   (&eacute;, &AElig;, &ccedil;, ...) are handled.
#
#   Literal accented letters (upper or lower case) are folded to the
#   LOWER-case base letter; entity forms keep the case of the letter that
#   is spelled out inside the entity (except &Ccedil;, which becomes "c").
#
# Arguments:
#   chunk   the text to clean up
#
# Results:
#   The text with the accents removed.
proc removeAccentsEnglishLongWebster { chunk } {
  # The accented letters are written as \u escapes so that the patterns
  # do not depend on the encoding this source file is read in.
  # NOTE(review): the literal characters of the earlier version were not
  # recoverable; the sets below (Latin-1 accented letters plus the oe
  # ligature and Y-diaeresis) are a reconstruction -- confirm against the
  # dictionary data.
  set substitutions [list \
    "\[\u00e0-\u00e5\u00c0-\u00c5\]"             a \
    "\[\u00e7\u00c7\]"                           c \
    "\[\u00e8-\u00eb\u00c8-\u00cb\]"             e \
    "\[\u00ec-\u00ef\u00cc-\u00cf\]"             i \
    "\[\u00f1\u00d1\]"                           n \
    "\[\u00f2-\u00f6\u00f8\u00d2-\u00d6\u00d8\]" o \
    "\[\u00f9-\u00fc\u00d9-\u00dc\]"             u \
    "\[\u00fd\u00ff\u00dd\u0178\]"               y \
    "\[\u00e6\u00c6\]"                           ae \
    "\[\u0153\u0152\]"                           oe \
    {&([aAoO])[eE]lig;}                          {\1e} \
    {&[cC]cedil;}                                c \
    {&([a-zA-Z])uml;}                            {\1} \
    {&([a-zA-Z])acute;}                          {\1} \
    {&([a-zA-Z])circ;}                           {\1} \
    {&([a-zA-Z])grave;}                          {\1} \
    {&([a-zA-Z])tilde;}                          {\1} \
    {&amacr;}                                    a \
  ]
  # The rules are applied one after another, in the order listed.
  # (&ouml; needs no rule of its own: the generic "uml" rule covers it.)
  foreach {pattern replacement} $substitutions {
    regsub -all -- $pattern $chunk $replacement chunk
  }
  return $chunk
}

# Dict::Webster1913unabr::normalForm --
#
#   Computes the key form of a longWebster line: the headword found in a
#   leading <p><hw>...</hw> is taken (the whole line is used when there is
#   no such leading tag), its accents are stripped, every character that
#   is not an ASCII letter is dropped, and the rest is lower-cased.
#
# Arguments:
#   ord   a line from the dictionary
#
# Results:
#   The lower-case, letters-only key.
proc Dict::Webster1913unabr::normalForm { ord } {
  # Fall back to the full line unless a leading headword tag is present.
  set headword $ord
  regexp {^<p><hw>([^<]*)</hw>} $ord -> headword
  set unaccented [removeAccentsEnglishLongWebster $headword]
  # Spaces go too, because that is how the entries are sorted.
  regsub -all {[^A-Za-z]} $unaccented {} lettersOnly
  # Only ASCII letters are left at this point, so nothing remains to trim.
  return [string tolower $lettersOnly]
}

# proc longWebsterCompare { et to } { 
# 	return [string compare [longWebsterNormalForm $et] \
# 	 [longWebsterNormalForm $to]]
# }
# 



# Dict::Webster1913unabr::untagWords --
#
#   Private helper for formatVerbet.  Replaces every match of tagExp in
#   text by prefix followed by the text captured by the expression's first
#   parenthesised group, after the pronunciation symbols ` " * | have been
#   removed from that captured text.
#
#   Each match is handled on its own and the captured text is inserted
#   verbatim, so a "&" or "\" inside a headword is not misread as a regsub
#   substitution marker and a second headword is not overwritten by the
#   first one.
#
# Arguments:
#   text     the string to work on
#   tagExp   regular expression with (at least) one capturing group
#   prefix   text put in front of every cleaned word (default: nothing)
#
# Results:
#   The rewritten string; text is returned unchanged if nothing matches.
proc Dict::Webster1913unabr::untagWords { text tagExp {prefix ""} } {
	set done ""
	while { [regexp -indices -- $tagExp $text whole word] } {
		set cleaned [string range $text [lindex $word 0] [lindex $word 1]]
		regsub -all {[`\"\*\|]} $cleaned "" cleaned
		append done [string range $text 0 [expr {[lindex $whole 0] - 1}]]
		append done $prefix $cleaned
		# Continue the search behind the match just handled.
		set text [string range $text [expr {[lindex $whole 1] + 1}] end]
	}
	return $done$text
}

# Dict::Webster1913unabr::formatVerbet --
#
#   Turns one raw, tagged dictionary line into display text: entities are
#   replaced by characters, headwords are freed from their tags and
#   pronunciation symbols, definitions are put on indented lines of their
#   own (lines are separated by carriage returns), etymologies in square
#   brackets are dropped, citations are quoted, the remaining markup is
#   removed, and the result is passed through betterBreak.
#
# Arguments:
#   linje   the raw line from the dictionary file
#
# Results:
#   The formatted text.
proc Dict::Webster1913unabr::formatVerbet { linje } {
	# IT IS IMPORTANT TO SUBSTITUTE THE HTML ENTITIES BEFORE ANY OTHER
	# FORMATTING, BECAUSE THOSE "&" CAN REALLY CONFUSE THE LATER REGSUBS.
	# The characters are written as \u escapes so that they do not depend
	# on the encoding of this source file.
	foreach {entity character} [list \
		"&aelig;"  "\u00e6" \
		"&AElig;"  "\u00c6" \
		"&oelig;"  "\u0153" \
		"&OElig;"  "\u0152" \
		"&ouml;"   "\u00f6" \
		"&euml;"   "\u00eb" \
		"&sect;"   "\u00a7" \
		"&eacute;" "\u00e9" \
		"&egrave;" "\u00e8" \
		"&ecirc;"  "\u00ea" \
		"&amacr;"  "a" \
	] {
		regsub -all -- $entity $linje $character linje
	}
	# Drop the "(&?;)" placeholders, together with one preceding space.
	regsub -all {[ ]?\(&\?;\)} $linje "" linje

	# HEADWORDS
	# The headword that opens a paragraph is indented by three spaces ...
	set linje [::Dict::Webster1913unabr::untagWords \
	 $linje {<p><hw>([^<]*)</hw>} "   "]
	# ... sometimes there are further headwords, handle those too ...
	set linje [::Dict::Webster1913unabr::untagWords \
	 $linje {<hw>([^<]*)</hw>}]
	# ... and sometimes there are variations of the headword.
	set linje [::Dict::Webster1913unabr::untagWords \
	 $linje {<wf>([^<]*)</wf>}]

	# Part of speech: <pos><i>n.</i></pos> becomes (n.)
	regsub -all "<pos><i>" $linje {(} linje
	regsub -all "</i></pos>" $linje {)} linje

	# Remove the "(?)" and "(?; 123)" placeholders.
	regsub -all " ?\\(\\?\\),?" $linje "" linje
	regsub -all {[ ]?\(\?; [ 0-9]*\)} $linje "" linje

	# DEFINITIONS
	# The first <def> or <sn> starts a new indented line; this indents
	# those word definitions which are not numbered.
	regsub -- (<def>|<sn>) $linje "\r    " linje
	# Every remaining <sn> starts a new indented line as well.
	regsub -all (<sn>) $linje "\r    " linje
	regsub -all {&fist;[ ]?} $linje "\r   Obs.  " linje
	regsub -all (<def>) $linje " " linje
	regsub -all (</def>|</sn>) $linje "" linje

	regsub -all {<p><b>Syn.</b>} $linje "\r   Syn." linje

	# Try to get rid of the etymology: [Xxx ...]
	regsub -all {\[[A-Z]([^\[]*)\]} $linje "" linje

	# Try to catch citations; they become: "quoted text" (source)
	regsub -all {<p><blockquote>(([^<]|<[^/]|</[^b]|</b[^l]|</bl[^o])*)</blockquote> <i>([^<]*)</i>} \
	 $linje "\"\\1\" (\\3) " linje
	regsub -all {<p><blockquote>(([^<]|<[^/]|</[^b]|</b[^l]|</bl[^o])*)[ ]?<BR>[ ]?<i>([^<]+)</i>[ ]?</blockquote>} \
	 $linje "\"\\1\" (\\3) " linje

	# OTHERS: strip whatever markup is left.
	regsub -all \
	 (</def>|<u>|</u>|<b>|</b>|<col>|</col>|<sd>|</sd>|<cd>|</cd>) \
	 $linje "" linje
	regsub -all "(<i>|</i>|</p>|<plw>|</plw>)" $linje "" linje
	regsub -all (<p>) $linje "" linje
	regsub -all (<BR>|<hw>|</hw>) $linje "" linje

	set linje [betterBreak $linje]
	# Collapse doubled line separators.
	regsub -all \r\r $linje \r linje
	return $linje
}

# betterBreak --
#
#   Splits chunk at carriage returns, passes every piece through
#   breakIntoLines, and joins the results with carriage returns again.
#
# Arguments:
#   chunk   text whose lines are separated by carriage returns
#
# Results:
#   The re-joined text; an empty chunk gives an empty string.
proc betterBreak { chunk } {
	# Start from an empty list: splitting an empty chunk yields no pieces,
	# and the list must still exist for the join below.
	set pL [list]
	foreach p [split $chunk \r] {
		lappend pL [breakIntoLines $p]
	}
	return [join $pL \r]
}

# Dict::TagStrip --
#
#   Deletes everything that looks like a tag, i.e. every "<...>" span that
#   contains no further "<" or ">", from a string.
#
# Arguments:
#   str   the text to clean
#
# Results:
#   The text without the tags.
proc Dict::TagStrip {str} {
	regsub -all {<[^<>]*>} $str "" untagged
	return $untagged
}
