Charset.java

package edu.berkeley.cs.nlp.ocular.data.textreader;

import static edu.berkeley.cs.nlp.ocular.util.CollectionHelper.makeMap;
import static edu.berkeley.cs.nlp.ocular.util.CollectionHelper.makeSet;
import static edu.berkeley.cs.nlp.ocular.util.CollectionHelper.setUnion;
import static edu.berkeley.cs.nlp.ocular.util.Tuple2.Tuple2;
import static edu.berkeley.cs.nlp.ocular.util.Tuple3.Tuple3;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import edu.berkeley.cs.nlp.ocular.util.StringHelper;
import edu.berkeley.cs.nlp.ocular.util.Tuple2;
import edu.berkeley.cs.nlp.ocular.util.Tuple3;
import tberg.murphy.indexer.Indexer;

/**
 * Static utilities for working with the characters of a transcription:
 * classifying them (punctuation, combining marks), and converting diacritics
 * between three encodings:
 * <ul>
 *   <li><b>precomposed</b>: a single code point such as "é";</li>
 *   <li><b>combining</b>: a base letter followed by unicode combining marks;</li>
 *   <li><b>escaped</b>: two-character backslash escapes preceding the base letter.</li>
 * </ul>
 * The normalized form used throughout this class is "base letter followed by
 * combining marks".
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class Charset {

	public static final String SPACE = " ";
	public static final String HYPHEN = "-";
	// NOTE(review): the final entry is U+03B4 (Greek small delta), which is not a Latin letter; confirm it is intended.
	public static final Set<String> LOWERCASE_LATIN_LETTERS = makeSet("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "\u03B4");
	public static final Set<String> LOWERCASE_VOWELS = makeSet("a", "e", "i", "o", "u");
	/** Special abbreviation glyphs; each one also appears as a key of {@link #LIGATURES} with its expansion. */
	public static final Set<String> SPECIALS = makeSet("\uA75D", "\u204A", "\uA76F", "\uA749", "\uA753", "\uA770", "\uA751", "\uA757");
	/** Maps a single ligature/abbreviation glyph to the letter sequence it stands for. */
	public static final Map<String,String> LIGATURES = makeMap(Tuple2("Æ","AE"), Tuple2("æ","ae"), Tuple2("Œ","OE"), Tuple2("œ","oe"), Tuple2("\uA75D", "rum"), Tuple2("\u204A", "et"), Tuple2("\uA76F", "con"), Tuple2("\uA749", "lis"), Tuple2("\uA753", "pro"), Tuple2("\uA770", "us"), Tuple2("\uA751", "per"), Tuple2("\uA757", "que"));
	public static final String LONG_S = "\u017F"; // ſ
	public static final Set<String> BANNED_CHARS = makeSet("@", "$", "%");
	/**
	 * Punctuation symbols that should be made available for any language, 
	 * regardless of whether they are seen in the language model training 
	 * material.
	 */
	public static final Set<String> UNIV_PUNC = makeSet("&", ".", ",", "[", "]", HYPHEN, "*", "§", "¶");

	/**
	 * A char counts as punctuation when it is neither whitespace, nor 
	 * alphabetic, nor a digit.
	 */
	private static boolean isPunctuation(char c) {
		return !Character.isWhitespace(c) && !Character.isAlphabetic(c) && !Character.isDigit(c);
	}

	/**
	 * Decide whether a single (possibly diacritic-carrying) character is 
	 * punctuation, judging only its base letter.
	 * 
	 * @param s	A single character in any diacritic encoding.
	 * @return	true iff every char of the diacritic-free base is punctuation.
	 * @throws RuntimeException if `s` is not exactly one character 
	 * (see {@link #normalizeCharSeparateDiacritics(String)}).
	 */
	public static boolean isPunctuationChar(String s) {
		for (char c : removeAnyDiacriticFromChar(s).toCharArray()) {
			if (!isPunctuation(c)) return false;
		}
		return true;
	}
	
	public static final String GRAVE_COMBINING = "\u0300";
	public static final String ACUTE_COMBINING = "\u0301";
	public static final String CIRCUMFLEX_COMBINING = "\u0302";
	public static final String TILDE_COMBINING = "\u0303";
	public static final String MACRON_COMBINING = "\u0304"; // shorter overline
	public static final String BREVE_COMBINING = "\u0306";
	public static final String DIAERESIS_COMBINING = "\u0308"; // == umlaut
	public static final String CEDILLA_COMBINING = "\u0327";
	// COMBINING MACRON BELOW, U+0331.  This must be a unicode escape: the 
	// literal "\0331" is parsed as the octal escape \033 (ESC) followed by '1'.
	public static final String MACRON_BELOW_COMBINING = "\u0331";

	/**
	 * Test whether a one-char string lies in one of the unicode combining-mark 
	 * ranges (0300-036F, 1AB0-1AFF, 1DC0-1DFF, 20D0-20FF, FE20-FE2F).
	 * The comparison is lexicographic, so it is only meaningful for strings 
	 * of length one, which is how every caller in this class uses it.
	 */
	private static boolean isCombiningChar(String c) {
		return (("\u0300".compareTo(c) <= 0 && c.compareTo("\u036F") <= 0) || 
				("\u1AB0".compareTo(c) <= 0 && c.compareTo("\u1AFF") <= 0) || 
				("\u1DC0".compareTo(c) <= 0 && c.compareTo("\u1DFF") <= 0) || 
				("\u20D0".compareTo(c) <= 0 && c.compareTo("\u20FF") <= 0) || 
				("\uFE20".compareTo(c) <= 0 && c.compareTo("\uFE2F") <= 0));
	}

	// Two-character escape sequences (a backslash plus one symbol) that may 
	// precede a letter to denote a diacritic.
	public static final String GRAVE_ESCAPE = "\\`";
	public static final String ACUTE_ESCAPE = "\\'";
	public static final String CIRCUMFLEX_ESCAPE = "\\^";
	public static final String TILDE_ESCAPE = "\\~";
	public static final String MACRON_ESCAPE = "\\-"; // shorter overline
	public static final String BREVE_ESCAPE = "\\v";
	public static final String DIAERESIS_ESCAPE = "\\\""; // == umlaut
	public static final String CEDILLA_ESCAPE = "\\c";
	public static final String MACRON_BELOW_ESCAPE = "\\_";

	/** Combining mark -> escape sequence. */
	private static final Map<String,String> COMBINING_TO_ESCAPE_MAP = new HashMap<String,String>();
	/** Escape sequence -> combining mark; the exact inverse of {@link #COMBINING_TO_ESCAPE_MAP}. */
	private static final Map<String,String> ESCAPE_TO_COMBINING_MAP = new HashMap<String,String>();
	static {
		COMBINING_TO_ESCAPE_MAP.put(GRAVE_COMBINING, GRAVE_ESCAPE);
		COMBINING_TO_ESCAPE_MAP.put(ACUTE_COMBINING, ACUTE_ESCAPE);
		COMBINING_TO_ESCAPE_MAP.put(CIRCUMFLEX_COMBINING, CIRCUMFLEX_ESCAPE);
		COMBINING_TO_ESCAPE_MAP.put(TILDE_COMBINING, TILDE_ESCAPE);
		COMBINING_TO_ESCAPE_MAP.put(MACRON_COMBINING, MACRON_ESCAPE);
		COMBINING_TO_ESCAPE_MAP.put(BREVE_COMBINING, BREVE_ESCAPE);
		COMBINING_TO_ESCAPE_MAP.put(DIAERESIS_COMBINING, DIAERESIS_ESCAPE);
		COMBINING_TO_ESCAPE_MAP.put(CEDILLA_COMBINING, CEDILLA_ESCAPE);
		COMBINING_TO_ESCAPE_MAP.put(MACRON_BELOW_COMBINING, MACRON_BELOW_ESCAPE);

		// Derive the inverse table so the two directions cannot drift apart.
		for (Map.Entry<String,String> entry : COMBINING_TO_ESCAPE_MAP.entrySet()) {
			ESCAPE_TO_COMBINING_MAP.put(entry.getValue(), entry.getKey());
		}
	}

	/**
	 * Translate a two-character escape sequence into its combining mark.
	 * 
	 * @throws RuntimeException if the escape sequence is not one of the 
	 * `*_ESCAPE` constants.
	 */
	private static String escapeToCombining(String escSeq) {
		String combining = ESCAPE_TO_COMBINING_MAP.get(escSeq);
		if (combining == null)
			throw new RuntimeException("Unrecognized escape sequence: [" + escSeq + "]");
		return combining;
	}

	/**
	 * Precomposed letter -> escaped form (escape sequence(s) followed by the 
	 * bare letter).  The other precomposed tables below are derived from this one.
	 */
	private static final Map<String, String> PRECOMPOSED_TO_ESCAPED_MAP = new HashMap<String, String>();
	static {
		PRECOMPOSED_TO_ESCAPED_MAP.put("à", "\\`a"); // \`a
		PRECOMPOSED_TO_ESCAPED_MAP.put("á", "\\'a"); // \'a
		PRECOMPOSED_TO_ESCAPED_MAP.put("â", "\\^a"); // \^a
		PRECOMPOSED_TO_ESCAPED_MAP.put("ä", "\\\"a"); // \"a
		PRECOMPOSED_TO_ESCAPED_MAP.put("ã", "\\~a"); // \~a
		PRECOMPOSED_TO_ESCAPED_MAP.put("ā", "\\-a"); // \-a
		PRECOMPOSED_TO_ESCAPED_MAP.put("ă", "\\va"); // \va

		PRECOMPOSED_TO_ESCAPED_MAP.put("è", "\\`e"); // \`e
		PRECOMPOSED_TO_ESCAPED_MAP.put("é", "\\'e"); // \'e
		PRECOMPOSED_TO_ESCAPED_MAP.put("ê", "\\^e"); // \^e
		PRECOMPOSED_TO_ESCAPED_MAP.put("ë", "\\\"e"); // \"e
		PRECOMPOSED_TO_ESCAPED_MAP.put("ẽ", "\\~e"); // \~e
		PRECOMPOSED_TO_ESCAPED_MAP.put("ē", "\\-e"); // \-e
		PRECOMPOSED_TO_ESCAPED_MAP.put("ĕ", "\\ve"); // \ve

		PRECOMPOSED_TO_ESCAPED_MAP.put("ì", "\\`i"); // \`i
		PRECOMPOSED_TO_ESCAPED_MAP.put("í", "\\'i"); // \'i
		PRECOMPOSED_TO_ESCAPED_MAP.put("î", "\\^i"); // \^i
		PRECOMPOSED_TO_ESCAPED_MAP.put("ï", "\\\"i"); // \"i
		PRECOMPOSED_TO_ESCAPED_MAP.put("ĩ", "\\~i"); // \~i
		PRECOMPOSED_TO_ESCAPED_MAP.put("ī", "\\-i"); // \-i
		PRECOMPOSED_TO_ESCAPED_MAP.put("ĭ", "\\vi"); // \vi
		// The dotless i ("ı", escaped as \ii) is deliberately not listed here: 
		// it is handled as a special case when escapes are read and written.

		PRECOMPOSED_TO_ESCAPED_MAP.put("ò", "\\`o"); // \`o
		PRECOMPOSED_TO_ESCAPED_MAP.put("ó", "\\'o"); // \'o
		PRECOMPOSED_TO_ESCAPED_MAP.put("ô", "\\^o"); // \^o
		PRECOMPOSED_TO_ESCAPED_MAP.put("ö", "\\\"o"); // \"o
		PRECOMPOSED_TO_ESCAPED_MAP.put("õ", "\\~o"); // \~o
		PRECOMPOSED_TO_ESCAPED_MAP.put("ō", "\\-o"); // \-o
		PRECOMPOSED_TO_ESCAPED_MAP.put("ŏ", "\\vo"); // \vo

		PRECOMPOSED_TO_ESCAPED_MAP.put("ù", "\\`u"); // \`u
		PRECOMPOSED_TO_ESCAPED_MAP.put("ú", "\\'u"); // \'u
		PRECOMPOSED_TO_ESCAPED_MAP.put("û", "\\^u"); // \^u
		PRECOMPOSED_TO_ESCAPED_MAP.put("ü", "\\\"u"); // \"u
		PRECOMPOSED_TO_ESCAPED_MAP.put("ũ", "\\~u"); // \~u
		PRECOMPOSED_TO_ESCAPED_MAP.put("ū", "\\-u"); // \-u
		PRECOMPOSED_TO_ESCAPED_MAP.put("ŭ", "\\vu"); // \vu

		PRECOMPOSED_TO_ESCAPED_MAP.put("ñ", "\\~n"); // \~n
		PRECOMPOSED_TO_ESCAPED_MAP.put("ç", "\\cc"); // \cc

		PRECOMPOSED_TO_ESCAPED_MAP.put("À", "\\`A"); // \`A
		PRECOMPOSED_TO_ESCAPED_MAP.put("Á", "\\'A"); // \'A
		PRECOMPOSED_TO_ESCAPED_MAP.put("Â", "\\^A"); // \^A
		PRECOMPOSED_TO_ESCAPED_MAP.put("Ä", "\\\"A"); // \"A
		PRECOMPOSED_TO_ESCAPED_MAP.put("Ã", "\\~A"); // \~A
		PRECOMPOSED_TO_ESCAPED_MAP.put("Ā", "\\-A"); // \-A
		PRECOMPOSED_TO_ESCAPED_MAP.put("Ă", "\\vA"); // \vA

		PRECOMPOSED_TO_ESCAPED_MAP.put("È", "\\`E"); // \`E
		PRECOMPOSED_TO_ESCAPED_MAP.put("É", "\\'E"); // \'E
		PRECOMPOSED_TO_ESCAPED_MAP.put("Ê", "\\^E"); // \^E
		PRECOMPOSED_TO_ESCAPED_MAP.put("Ë", "\\\"E"); // \"E
		PRECOMPOSED_TO_ESCAPED_MAP.put("Ẽ", "\\~E"); // \~E
		PRECOMPOSED_TO_ESCAPED_MAP.put("Ē", "\\-E"); // \-E
		PRECOMPOSED_TO_ESCAPED_MAP.put("Ĕ", "\\vE"); // \vE

		PRECOMPOSED_TO_ESCAPED_MAP.put("Ì", "\\`I"); // \`I
		PRECOMPOSED_TO_ESCAPED_MAP.put("Í", "\\'I"); // \'I
		PRECOMPOSED_TO_ESCAPED_MAP.put("Î", "\\^I"); // \^I
		PRECOMPOSED_TO_ESCAPED_MAP.put("Ï", "\\\"I"); // \"I
		PRECOMPOSED_TO_ESCAPED_MAP.put("Ĩ", "\\~I"); // \~I
		PRECOMPOSED_TO_ESCAPED_MAP.put("Ī", "\\-I"); // \-I
		PRECOMPOSED_TO_ESCAPED_MAP.put("Ĭ", "\\vI"); // \vI

		PRECOMPOSED_TO_ESCAPED_MAP.put("Ò", "\\`O"); // \`O
		PRECOMPOSED_TO_ESCAPED_MAP.put("Ó", "\\'O"); // \'O
		PRECOMPOSED_TO_ESCAPED_MAP.put("Ô", "\\^O"); // \^O
		PRECOMPOSED_TO_ESCAPED_MAP.put("Ö", "\\\"O"); // \"O
		PRECOMPOSED_TO_ESCAPED_MAP.put("Õ", "\\~O"); // \~O
		PRECOMPOSED_TO_ESCAPED_MAP.put("Ō", "\\-O"); // \-O
		PRECOMPOSED_TO_ESCAPED_MAP.put("Ŏ", "\\vO"); // \vO

		PRECOMPOSED_TO_ESCAPED_MAP.put("Ù", "\\`U"); // \`U
		PRECOMPOSED_TO_ESCAPED_MAP.put("Ú", "\\'U"); // \'U
		PRECOMPOSED_TO_ESCAPED_MAP.put("Û", "\\^U"); // \^U
		PRECOMPOSED_TO_ESCAPED_MAP.put("Ü", "\\\"U"); // \"U
		PRECOMPOSED_TO_ESCAPED_MAP.put("Ũ", "\\~U"); // \~U
		PRECOMPOSED_TO_ESCAPED_MAP.put("Ū", "\\-U"); // \-U
		PRECOMPOSED_TO_ESCAPED_MAP.put("Ŭ", "\\vU"); // \vU

		PRECOMPOSED_TO_ESCAPED_MAP.put("Ñ", "\\~N"); // \~N
		PRECOMPOSED_TO_ESCAPED_MAP.put("Ç", "\\cC"); // \cC

		// note: superscript is marked \s as in superscript o = \so and superscript r is \sr
		//note for "breve" (u over letter) mark \va
	}

	/**
	 * Precomposed letter -> base letter followed by combining mark(s), derived 
	 * from {@link #PRECOMPOSED_TO_ESCAPED_MAP}.
	 */
	private static final Map<String, String> PRECOMPOSED_TO_COMBINED_MAP = new HashMap<String, String>();
	static {
		for (Map.Entry<String, String> entry : PRECOMPOSED_TO_ESCAPED_MAP.entrySet()) {
			String escaped = entry.getValue();
			// The last char is the bare letter; everything before it is a run of 2-char escapes.
			String baseChar = escaped.substring(escaped.length() - 1);
			String escapeCodes = escaped.substring(0, escaped.length() - 1);
			if (escapeCodes.length() % 2 != 0) throw new RuntimeException("problem with precomposed mapping: " + escaped);
			StringBuilder baseWithCombining = new StringBuilder(baseChar);
			// Walk the escapes right-to-left so the escape nearest the letter 
			// becomes the first combining mark after it.
			for (int i = escapeCodes.length() - 2; i >= 0; i -= 2) {
				baseWithCombining.append(escapeToCombining(escapeCodes.substring(i, i + 2)));
			}
			PRECOMPOSED_TO_COMBINED_MAP.put(entry.getKey(), baseWithCombining.toString());
		}
	}
	
	/** Base letter + combining mark(s) -> precomposed letter; inverse of {@link #PRECOMPOSED_TO_COMBINED_MAP}. */
	private static final Map<String, String> COMBINED_TO_PRECOMPOSED_MAP = new HashMap<String, String>();
	static {
		for (Map.Entry<String, String> entry : PRECOMPOSED_TO_COMBINED_MAP.entrySet()) {
			COMBINED_TO_PRECOMPOSED_MAP.put(entry.getValue(), entry.getKey());
		}
	}
	
	public static final Set<String> CHARS_THAT_CAN_BE_REPLACED = setUnion(LOWERCASE_LATIN_LETTERS, makeSet("ç")); // TODO: Change this?
	public static final Set<String> VALID_CHAR_SUBSTITUTIONS = setUnion(LOWERCASE_LATIN_LETTERS, SPECIALS); // TODO: Change this?
	public static final Set<String> CHARS_THAT_CAN_DOUBLED = LOWERCASE_LATIN_LETTERS; // TODO: Change this?
	public static final Set<String> CHARS_THAT_CAN_BE_DECORATED_WITH_AN_ELISION_TILDE = LOWERCASE_LATIN_LETTERS; // TODO: Change this?
	public static final Set<String> CHARS_THAT_CAN_BE_ELIDED = LOWERCASE_LATIN_LETTERS; // TODO: Change this?
	private static final Set<String> COMBINING_DIACRITICS_THAT_CAN_BE_DISREGARDED = makeSet(GRAVE_COMBINING, ACUTE_COMBINING);
	public static final Set<String> LETTERS_WITH_DISREGARDEDABLE_DIACRITICS = LOWERCASE_VOWELS;
	
	/**
	 * Collect the indices of all indexed characters that belong to `members`.
	 */
	private static Set<Integer> indicesOfMembers(Indexer<String> charIndexer, Set<String> members) {
		Set<Integer> indices = new HashSet<Integer>();
		for (String c : charIndexer.getObjects()) {
			if (members.contains(c))
				indices.add(charIndexer.getIndex(c));
		}
		return indices;
	}

	/**
	 * Decide whether a letter's diacritics may be ignored: the base letter must 
	 * be in {@link #LETTERS_WITH_DISREGARDEDABLE_DIACRITICS} and at least one of 
	 * its diacritics must be a grave or acute combining mark.
	 */
	private static boolean hasDisregardableDiacritic(String baseLetter, List<String> diacritics) {
		if (!LETTERS_WITH_DISREGARDEDABLE_DIACRITICS.contains(baseLetter)) return false;
		for (String diacritic : diacritics) {
			if (COMBINING_DIACRITICS_THAT_CAN_BE_DISREGARDED.contains(diacritic)) return true;
		}
		return false;
	}

	/**
	 * @return	The indices of all indexed characters for which 
	 * {@link #isPunctuationChar(String)} holds.
	 */
	public static Set<Integer> makePunctSet(Indexer<String> charIndexer) {
		Set<Integer> punctSet = new HashSet<Integer>();
		for (String c : charIndexer.getObjects()) {
			if (isPunctuationChar(c))
				punctSet.add(charIndexer.getIndex(c));
		}
		return punctSet;
	}

	/** @return	The indices of indexed characters in {@link #CHARS_THAT_CAN_BE_REPLACED}. */
	public static Set<Integer> makeCanBeReplacedSet(Indexer<String> charIndexer) {
		return indicesOfMembers(charIndexer, CHARS_THAT_CAN_BE_REPLACED);
	}

	/** @return	The indices of indexed characters in {@link #VALID_CHAR_SUBSTITUTIONS}. */
	public static Set<Integer> makeValidSubstitutionCharsSet(Indexer<String> charIndexer) {
		return indicesOfMembers(charIndexer, VALID_CHAR_SUBSTITUTIONS);
	}

	/** @return	The indices of indexed characters in {@link #CHARS_THAT_CAN_DOUBLED}. */
	public static Set<Integer> makeValidDoublableSet(Indexer<String> charIndexer) {
		return indicesOfMembers(charIndexer, CHARS_THAT_CAN_DOUBLED);
	}

	/** @return	The indices of indexed characters in {@link #CHARS_THAT_CAN_BE_ELIDED}. */
	public static Set<Integer> makeCanBeElidedSet(Indexer<String> charIndexer) {
		return indicesOfMembers(charIndexer, CHARS_THAT_CAN_BE_ELIDED);
	}

	/**
	 * Map each character that may carry an elision tilde to the index of its 
	 * tilde-decorated base letter.  A character qualifies if it is itself in 
	 * {@link #CHARS_THAT_CAN_BE_DECORATED_WITH_AN_ELISION_TILDE}, or if it is a 
	 * vowel carrying a disregardable (grave/acute) diacritic, in which case 
	 * that diacritic is dropped and replaced by the tilde.
	 * 
	 * NOTE(review): the index of the tilde-decorated form is obtained with 
	 * `charIndexer.getIndex`; what that returns for a character not already 
	 * indexed depends on the Indexer implementation -- confirm.
	 * 
	 * @throws RuntimeException if any indexed string is not a single character.
	 */
	public static Map<Integer,Integer> makeAddTildeMap(Indexer<String> charIndexer) {
		Map<Integer,Integer> m = new HashMap<Integer, Integer>();
		for (String original : charIndexer.getObjects()) {
			Tuple2<String,List<String>> letterAndDiacritics = normalizeCharSeparateDiacritics(original);
			String baseLetter = letterAndDiacritics._1;
			boolean canTakeTilde = 
					CHARS_THAT_CAN_BE_DECORATED_WITH_AN_ELISION_TILDE.contains(original) || 
					hasDisregardableDiacritic(baseLetter, letterAndDiacritics._2);
			if (canTakeTilde) {
				m.put(charIndexer.getIndex(original), charIndexer.getIndex(addTilde(baseLetter)));
			}
		}
		return m;
	}

	/**
	 * Map the index of each ligature glyph in {@link #LIGATURES} to the list of 
	 * indices of the characters it expands to.
	 * 
	 * @throws RuntimeException if a ligature key normalizes to more than one character.
	 */
	public static Map<Integer,List<Integer>> makeLigatureMap(Indexer<String> charIndexer) {
		Map<Integer,List<Integer>> m = new HashMap<Integer, List<Integer>>();
		for (Map.Entry<String,String> entry : LIGATURES.entrySet()) {
			List<String> ligature = readNormalizeCharacters(entry.getKey());
			if (ligature.size() > 1) throw new RuntimeException("Ligature ["+entry.getKey()+"] has more than one character: "+ligature);
			List<Integer> expansion = new ArrayList<Integer>();
			for (String c : readNormalizeCharacters(entry.getValue())) {
				expansion.add(charIndexer.getIndex(c));
			}
			m.put(charIndexer.getIndex(ligature.get(0)), expansion);
		}
		return m;
	}

	/**
	 * Map the index of each vowel carrying a disregardable (grave/acute) 
	 * diacritic to the index of its bare base letter.
	 * 
	 * @throws RuntimeException if any indexed string is not a single character.
	 */
	public static Map<Integer,Integer> makeDiacriticDisregardMap(Indexer<String> charIndexer) {
		Map<Integer,Integer> m = new HashMap<Integer,Integer>();
		for (String original : charIndexer.getObjects()) { // find accented letters
			Tuple2<String,List<String>> letterAndDiacritics = normalizeCharSeparateDiacritics(original);
			String baseLetter = letterAndDiacritics._1;
			if (hasDisregardableDiacritic(baseLetter, letterAndDiacritics._2)) {
				m.put(charIndexer.getIndex(original), charIndexer.getIndex(baseLetter));
			}
		}
		return m;
	}
	
	/**
	 * @return	The normalized form of `c` with a combining tilde added after 
	 * any combining marks already trailing it.
	 */
	public static String addTilde(String c) {
		return normalizeChar(c + TILDE_COMBINING);
	}
	
	/**
	 * Normalize a single character, taking into account any escaped diacritics 
	 * that precede the letter and any unicode "combining characters" that 
	 * follow it.
	 * 
	 * Precomposed accents are given the highest priority.  Combining characters 
	 * are interpreted as left-associative and high-priority, while escapes are 
	 * right-associative and low-priority.  So, for a letter x with precomposed
	 * diacritic 0, combining chars 1,2,3, and escapes 4,5,6, the input 654x123 
	 * becomes encoded (with escapes, see {@link #fullyEscapeChar(String)}) as 
	 * 6543210x, and normalized (with combining characters, as returned here) as 
	 * x0123456.
	 * 
	 * @param c	A single character, potentially with diacritics encoded in any 
	 * form (composed, precomposed, escaped).
	 * @return	A string representing a single normalized character: the bare 
	 * base letter followed by all of its diacritics (precomposed, combining, 
	 * and escaped alike) as combining characters, joined with 
	 * `StringHelper.join`.
	 * @throws RuntimeException if the parameter `c` does not represent a single
	 * (potentially composed or escaped) character.
	 */
	public static String normalizeChar(String c) {
		Tuple2<String, List<String>> letterAndDiacritics = normalizeCharSeparateDiacritics(c);
		return letterAndDiacritics._1 + StringHelper.join(letterAndDiacritics._2);
	}

	/**
	 * Like {@link #normalizeChar(String)}, but returns the base letter and its 
	 * diacritics separately.
	 * 
	 * @param c	A single character, potentially with diacritics encoded in any 
	 * form (composed, precomposed, escaped).
	 * @return	A fully-normalized character, with all diacritics (combining and 
	 * precomposed) converted to their equivalent combining forms and placed in
	 * a list to be returned with the bare letter.
	 * @throws RuntimeException if the parameter `c` does not represent a single
	 * (potentially composed or escaped) character.
	 */
	public static Tuple2<String,List<String>> normalizeCharSeparateDiacritics(String c) {
		Tuple3<String, List<String>, Integer> letterAndLength = readLetterAndNormalDiacriticsAt(c, 0);
		int length = letterAndLength._3;
		// If reading one character did not consume the whole input, there is more than one.
		if (c.length() != length) throw new RuntimeException("Could not escape ["+c+"] because it contains more than one character ("+StringHelper.toUnicode(c)+")");
		return Tuple2(letterAndLength._1, letterAndLength._2);
	}

	/**
	 * Read a single character from the line, starting at the given offset.
	 * 
	 * @see #normalizeChar(String)
	 * 
	 * @param line	A line of text possibly containing characters with diacritics
	 * composed, precomposed, or escaped.
	 * @param offset	The offset point in `line` from which to start reading for a 
	 * character.
	 * @return	A fully-normalized character string, with all diacritics (combining
	 * and precomposed) converted to their equivalent combining forms.  Also 
	 * return the length in the ORIGINAL string of the span used to produce this 
	 * normalized character (to use as an offset when scanning through the string).
	 */
	private static Tuple2<String, Integer> readNormalizeCharAt(String line, int offset) {
		Tuple3<String, List<String>, Integer> result = readLetterAndNormalDiacriticsAt(line, offset);
		String c = result._1 + StringHelper.join(result._2);
		int length = result._3;
		return Tuple2(c, length);
	}
	
	/**
	 * Read a single character from the line including a list of all its diacritics, 
	 * starting at the given offset.
	 * 
	 * @see #normalizeChar(String)
	 * 
	 * @param line	A line of text possibly containing characters with diacritics
	 * composed, precomposed, or escaped.
	 * @param offset	The offset point in `line` from which to start reading for a 
	 * character.
	 * @return	A triple of: the base letter with all diacritics removed; the 
	 * list of its diacritics, all converted to combining characters (precomposed 
	 * first, then trailing combining characters, then escapes from the one nearest 
	 * the letter outwards); and the length in the ORIGINAL string of the span 
	 * used to produce this normalized character (to use as an offset when 
	 * scanning through the string).
	 * @throws RuntimeException if the offset is not before the end of the line, 
	 * if an escape is incomplete or unrecognized, if no letter follows the 
	 * escapes, if the letter position holds a combining character, or if 
	 * diacritics are attached to something that is not alphabetic.
	 */
	private static Tuple3<String, List<String>, Integer> readLetterAndNormalDiacriticsAt(String line, int offset) {
		int lineLen = line.length();
		if (offset >= lineLen) throw new RuntimeException("offset must be less than the line length");
		
		if (lineLen - offset >= 2 && line.startsWith("\\\\", offset))
			return Tuple3("\\\\", (List<String>)new ArrayList<String>(), 2); // "\\" is its own character (for "\"), not an escaped diacritic
		
		List<String> escapeDiacritics = new ArrayList<String>(); // in reversed order!
		List<String> combiningDiacritics = new ArrayList<String>();

		// get any escape prefixes characters
		int i = offset;
		while (i < lineLen && line.charAt(i) == '\\') {
			if (i + 1 >= lineLen) throw new RuntimeException("expected more after escape symbol, but found nothing: " + i + "," + lineLen + " " + line.substring(Math.max(0, i - 10), i) + "[" + line.substring(i) + "]");
			String escape = line.substring(i, i + 2);
			escapeDiacritics.add(0, escape); // prepend, so the escape nearest the letter ends up first
			i += 2; // accept the 2-character escape sequence
		}

		if (i >= lineLen) throw new RuntimeException("expected a letter after escape code, but found nothing: " + i + "," + lineLen + " " + line.substring(Math.max(0, i - 50), i) + "[" + line.substring(i) + "]");
		String letter = String.valueOf(line.charAt(i));
		if (isCombiningChar(letter)) throw new RuntimeException("found unexpected combining char: " + i + "," + lineLen + " " + line.substring(Math.max(0, i - 50), i) + "[" + line.substring(i) + "]");
		i += 1; // accept the letter itself

		// get any combining characters
		while (i < lineLen) {
			String next = line.substring(i, i + 1);
			if (!isCombiningChar(next)) break;
			combiningDiacritics.add(next);
			i++; // accept the combining character
		}

		// Split a precomposed letter into its base letter and combining mark(s); 
		// these marks go in front of any that trailed the letter in the input.
		String deprecomposedChar = PRECOMPOSED_TO_COMBINED_MAP.get(letter);
		String letterOnly;
		if (deprecomposedChar == null) {
			letterOnly = letter;
		}
		else {
			letterOnly = String.valueOf(deprecomposedChar.charAt(0));
			for (int j = 1; j < deprecomposedChar.length(); ++j)
				combiningDiacritics.add(0, String.valueOf(deprecomposedChar.charAt(j)));
		}
		
		// Escapes come last, nearest-to-the-letter first.
		for (String diacritic : escapeDiacritics) {
			if (diacritic.equals("\\i")) {
				// \i is not a diacritic: it turns the letter 'i' into a dotless i
				if (!letterOnly.equals("i")) throw new RuntimeException("the \\i escape sequence can only be used on the character 'i' (to indicate a no-dot i)");
				letterOnly = "ı";
			}
			else {
				combiningDiacritics.add(escapeToCombining(diacritic));
			}
		}
		
		if (letterOnly.length() != 1) throw new RuntimeException("base letter should be length 1, found: " + letterOnly);
		if (!combiningDiacritics.isEmpty()) {
			char letterChar = letterOnly.charAt(0);
			if (!(Character.isAlphabetic(letterChar))) 
				throw new RuntimeException("because there were diacritics, letter is expected, but something else was found: " + i + "," + lineLen + " " + line.substring(Math.max(0, i - 50), i) + "[" + line.substring(i) + "]");
		}
		
		return Tuple3(letterOnly, combiningDiacritics, i - offset);
	}

	/**
	 * Convert a string into a sequence of diacritic-normalized characters.
	 * 
	 * @see #normalizeChar(String)
	 * 
	 * @param line	A line of text possibly containing characters with diacritics
	 * composed, precomposed, or escaped.
	 * @return	The characters of the line in order, each fully normalized, with 
	 * all diacritics (combining and precomposed) converted to their equivalent 
	 * combining chars.  An empty line yields an empty list.
	 */
	public static List<String> readNormalizeCharacters(String line) {
		List<String> normalizedChars = new ArrayList<String>();
		int i = 0;
		while (i < line.length()) {
			Tuple2<String, Integer> normalizedCharAndLength = readNormalizeCharAt(line, i);
			normalizedChars.add(normalizedCharAndLength._1);
			i += normalizedCharAndLength._2; // advance to the next character
		}
		return normalizedChars;
	}
	
	/**
	 * Convert character into unicode precomposed and combining characters.
	 * 
	 * The first diacritic is folded into a precomposed letter when one is known.
	 * 
	 * @param c	A single character in any diacritic encoding.  The special 
	 * two-backslash character is returned as a single backslash.
	 * @param precomposedOnly	If false, every diacritic that is not folded into 
	 * the precomposed letter is emitted as a combining character after it.  If 
	 * true, no combining characters are emitted: each remaining diacritic is 
	 * instead written as an escape sequence in front of the letter or, when it 
	 * has no escape sequence, as the text produced by `StringHelper.toUnicode` 
	 * appended after the letter.
	 * @throws RuntimeException if `c` does not represent a single character.
	 */
	public static String unescapeChar(String c, boolean precomposedOnly) {
		if (c.equals("\\\\")) return "\\";
		
		Tuple2<String,List<String>> letterAndNormalDiacritics = normalizeCharSeparateDiacritics(c); // use combining chars only (and make sure it's a valid character)
		String baseLetter = letterAndNormalDiacritics._1;
		List<String> diacritics = letterAndNormalDiacritics._2;
		
		if (diacritics.isEmpty()) return baseLetter;
		
		StringBuilder b = new StringBuilder();
		
		// Attempt to make a precomposed letter, falling back to composed otherwise
		String firstDiacritic = diacritics.get(0);
		String precomposed = COMBINED_TO_PRECOMPOSED_MAP.get(baseLetter + firstDiacritic); 
		if (precomposed != null)
			b.append(precomposed);
		else {
			b.append(baseLetter);
			if (!precomposedOnly) b.append(firstDiacritic);
		}

		if (precomposedOnly) {
			// Handle the rest of the diacritics (including the first one, if it 
			// could not be folded into a precomposed letter)
			for (int i = (precomposed != null ? 1 : 0); i < diacritics.size(); ++i) {
				String escape = COMBINING_TO_ESCAPE_MAP.get(diacritics.get(i));
				if (escape != null)
					b.insert(0, escape);
				else
					b.append(StringHelper.toUnicode(diacritics.get(i)));
			}
		}
		else {
			// Handle the rest of the diacritics
			for (int i = 1; i < diacritics.size(); ++i) {
				b.append(diacritics.get(i));
			}
		}
		
		return b.toString();
	}

	/**
	 * Convert character into unicode precomposed and combining characters; 
	 * equivalent to {@code unescapeChar(c, false)}.
	 */
	public static String unescapeChar(String c) {
		return unescapeChar(c, false);
	}

	/**
	 * Convert character into a base character and explicit escape sequences.
	 * 
	 * Each diacritic with a known escape sequence is written in front of the 
	 * letter, so the diacritic nearest the letter in normalized form stays 
	 * nearest the letter.  A diacritic without an escape sequence is appended 
	 * after the letter as the text produced by `StringHelper.toUnicode`.  The 
	 * dotless i is written as the letter "\ii".
	 * 
	 * @param c	A single character in any diacritic encoding.  The special 
	 * two-backslash character is returned unchanged.
	 * @throws RuntimeException if `c` does not represent a single character.
	 */
	public static String fullyEscapeChar(String c) {
		if (c.equals("\\\\")) return c;
		
		Tuple2<String,List<String>> letterAndNormalDiacritics = normalizeCharSeparateDiacritics(c); // use combining chars only (and make sure it's a valid character)
		String baseLetter = letterAndNormalDiacritics._1;
		List<String> diacritics = letterAndNormalDiacritics._2;
		if (baseLetter.equals("ı"))
			baseLetter = "\\ii";
		
		if (diacritics.isEmpty()) return baseLetter;
		
		StringBuilder b = new StringBuilder(baseLetter);
		for (String diacritic : diacritics) {
			String escape = COMBINING_TO_ESCAPE_MAP.get(diacritic);
			if (escape != null)
				b.insert(0, escape);
			else
				b.append(StringHelper.toUnicode(diacritic));
		}
		
		return b.toString();
	}

	/**
	 * @return	The bare base letter of `c`, with every diacritic removed.
	 * @throws RuntimeException if `c` does not represent a single character.
	 */
	public static String removeAnyDiacriticFromChar(String c) {
		return normalizeCharSeparateDiacritics(c)._1;
	}

}