字符串排序

键索引记数法

将String[]分成R组，统计每组字符串个数，由此计算出每组的首个字符串在排序后的位置后，即可得到每个字符串排序后的下标。

该方法有一个特点：排序后，同一组内的字符串排序前的相对位置不变，正是这一点使得其可以用于低位优先字符串排序

低位优先的字符串排序

前提条件：每个字符串等长
所谓低位优先，就是从右往左排序，先排最右边的

假设字符串长度W，从右往左，根据字符串当前位置的字符作为组号分成R组，使用键索引记数法完成W次排序即可得到这样有序的数组：
优先比较最左边元素，最左边元素相同，比较后面的元素

public class LSD {
	public LSD() {
	}

	public static void sort(String[] a, int W) {
		// 通过前W个字符将a[]排序
		int N = a.length;
		String aux[] = new String[N];
		int R = 256;// ASCII字符集的字符数量，8位的ASCII字符集
		for (int d = W - 1; d >= 0; d--) {
			int count[] = new int[R + 1];

			// 统计频率
			for (int i = 0; i < N; i++)
				count[a[i].charAt(d) + 1]++;
			// 将频率转换为索引
			for (int r = 0; r < R; r++)
				count[r + 1] += count[r];
			// 将元素分类
			for (int i = 0; i < N; i++)
				aux[count[a[i].charAt(d)]++] = a[i];
			// 回写
			for (int i = 0; i < N; i++)
				a[i] = aux[i];
		}

	}

	public static void main(String[] args) {
		String a[] = { "4PGC938", "2IYE230", "3CIO720", "1ICK750", "1OHV845" };
		int W = 7;
		sort(a, W);
		for (int i = 0; i < a.length; i++)
			System.out.println(a[i]);

	}

}

高位优先的字符串排序

这种排序适合排序不同长度的字符串

首先从每个字符串最左边的第一个字符作为分组标志，使用键索引记数法处理完成排序，之后针对排序后的每一个分组，对后面的字符递归进行一样的操作

类似快排的思想，但是这里的递归不是把数组分成两部分递归，而是R部分递归

package Number_5;

/**
 * P462 算法5.2 高位优先的字符串排序（从左向右，字符串长度不一定相同）
 * 对于含大量等值键的子数组排序会比较慢且不适合很长字符串
 * 
 * @author he
 *
 */
public class MSD {
	private static int R = 256;// 8为ASCII表中的字符数量
	private static final int M = 15;// 小数组的切分
	private static String aux[];// 辅助数组

	public MSD() {

	}

	private static int charAt(String s, int d) {
		if (d < s.length())
			return s.charAt(d);
		else {
			return -1;
		}
	}

	public static void sort(String a[]) {
		int N = a.length;
		aux = new String[N];
		sort(a, 0, N - 1, 0);
	}

	private static void sort(String a[], int lo, int hi, int d) {
		// 对于小于一定长度的数组进行插入排序
		if (hi <= lo + M) {
			insertion(a, lo, hi, d);
			return;
		}

		int count[] = new int[R + 2];// count[0]不保存东西，count[1]保存的是长度不大于d的字符串数量
		// 计算每个字符串位置d处的字符出现的频率
		for (int i = lo; i <= hi; i++) {
			int t = charAt(a[i], d);
			count[t + 2]++;
		}

		// 转换为索引
		for (int r = 0; r < R + 1; r++)
			count[r + 1] += count[r];

		// 分类
		for (int i = lo; i <= hi; i++) {
			int t = charAt(a[i], d);
			aux[count[t + 1]++] = a[i];
		}

		for (int i = lo; i <= hi; i++)
			a[i] = aux[i - lo];

		// 递归每个子字符串组
		for (int r = 0; r < R; r++)
			sort(a, lo + count[r], lo + count[r + 1] - 1, d + 1);

	}

	// 对字符串组插入排序
	private static void insertion(String a[], int lo, int hi, int d) {
		for (int i = lo; i <= hi; i++)
			for (int j = i; j > lo && less(a[j], a[j - 1], d); j--)
				exch(a, j, j - 1);
	}

	private static boolean less(String w, String v, int d) {
		for (int i = d; i < Math.min(w.length(), v.length()); i++) {
			if (w.charAt(i) < v.charAt(i))
				return true;
			if (w.charAt(i) > v.charAt(i)) {
				return false;
			}
		}

		return w.length() < v.length();
	}

	private static void exch(String a[], int i, int j) {
		String temp = a[i];
		a[i] = a[j];
		a[j] = temp;

	}

	public static void main(String[] args) {
		String[] a = { "by", "are", "seashells", "seashells", "sells" };
		sort(a);
		for (int i = 0; i < a.length; i++)
			System.out.println(a[i]);

	}

}

三向字符串快速排序

高位优先的字符串排序时，每次根据字符串当前字符的不同，分成R组，而三向排序只分为三组。大于指定字符的，等于指定字符的，小于指定字符的。
只有等于指定字符的，继续向后递归。
不等于指定字符的，继续在当前字符位置递归。

适合于String[]中重复字符串密集的情况

/**
 * P469 算法5.3 三向字符串快速排序 适合含有大量等值键、有较长公共前缀的键、取值范围较小的键和小数组
 */
public class Quick3string {

	private static final int M = 0;// 切分数组的大小

	public Quick3string() {
	}

	private static int chatAt(String s, int d) {
		if (d < s.length())
			return s.charAt(d);
		else
			return -1;
	}

	public static void sort(String a[]) {
		StdRandom.shuffle(a);// 消除对输入的依赖
		sort(a, 0, a.length - 1, 0);
	}

	private static void sort(String a[], int lo, int hi, int d) {
		// 对于小数组使用插入排序
		if (hi <= lo + M) {
			Insterion(a, lo, hi, d);
			return;
		}
		
		// 以String[]第d个字符作为Key做一次快排切分
		int lt = lo, gt = hi, i = lo + 1;
		int v = chatAt(a[lo], d);

		while (i <= gt) {
			int t = chatAt(a[i], d);
			if (t < v)
				exch(a, lt++, i++);
			else if (t > v)
				exch(a, i, gt--);
			else
				i++;
		}
		
		// 递归，先排序小于string[lo]的字符串，再排序等于String[lo]的字符串，最后排序大于string[lo]的字符串
		sort(a, lo, lt - 1, d);		// 递归小于string[lo]的Strings的第d位
		if (v > 0)
			sort(a, lt, gt, d + 1);		// 第d位等于String[lo]的Strings递归下一位
		sort(a, gt + 1, hi, d);		// 递归大于string[lo]的Strings的第d位
	}

	// 插入排序
	private static void Insterion(String a[], int lo, int hi, int d) {
		for (int i = lo; i <= hi; i++)
			for (int j = i; j > 0 && less(a[j], a[j - 1], d); j--)
				exch(a, j, j - 1);
	}

	private static boolean less(String s, String v, int d) {
		for (int i = d; i < Math.min(s.length(), v.length()); i++) {
			if (s.charAt(i) < v.charAt(i))
				return true;
			if (s.charAt(i) > v.charAt(i))
				return false;
		}
		return s.length() < v.length();
	}

	private static void exch(String a[], int i, int j) {
		String temp = a[i];
		a[i] = a[j];
		a[j] = temp;
	}

	public static void main(String[] args) {
		String a[] = { "com.cnn", "edu.uva.cs", "edu.uva.cs", "com.google" };
		sort(a);
		for (int i = 0; i < a.length; i++)
			System.out.println(a[i]);

	}

}

字符串查找

单词查找树

1、树的每个节点维护：
node[]，node数组的下标索引代表单词的一个字符
一个泛型变量val，当前节点表示的字符是单词末尾字符时非空
2、单词查找树的root节点不存在val

无论是增加单词还是查找单词还是匹配单词，从根节点出发，不断查找非空next[i]或创建next[i]

public class TrieST<Value> {
	private static final int R = 256;// 字母表的大小，8位的ASCII表中元素个数
	private Node root;// 单词查找树的根结点,root.val==null
	private int N;// 查找树中键的数量

	public TrieST() {
	}

	private static class Node {
		private Object val;// 键对应的值
		private Node next[] = new Node[R];// 保存指向其他Node对象的引用

	}

	public void put(String key, Value val) {
		root = put(root, key, val, 0);
	}

	// 把key中的第d个字符添加到单词查找树中
	private Node put(Node x, String key, Value val, int d) {
		if (x == null)
			x = new Node();
		// 将值保存到键的最后一个字符所在结点
		if (d == key.length()) {
			if (x.val == null)
				N++;
			x.val = val;
			return x;
		}
		char c = key.charAt(d);// 找到d个字符对应的子单词查找树
		x.next[c] = put(x.next[c], key, val, d + 1);
		return x;
	}

	@SuppressWarnings("unchecked")
	public Value get(String key) {
		Node x = get(root, key, 0);
		if (x == null)
			return null;
		return (Value) x.val;
	}
	
	// 单词查找树找到key，返回该key的最后一个字符所对应的node
	private Node get(Node x, String key, int d) {
		if (x == null)
			return null;
		if (d == key.length())
			return x;
		char c = key.charAt(d);// 找到d个字符所对应的子单词查找树
		return get(x.next[c], key, d + 1);
	}

	/*
	 * 返回单词查找树中所有的键
	 */
	public Iterable<String> keys() {
		return keysWithPrefix("");

	}

	/**
	 * 返回所有以pre为前缀的键 比如单词查找树中有“shells”,pre为"shel"则返回shells
	 */
	public Iterable<String> keysWithPrefix(String pre) {
		Queue<String> q = new Queue<String>();
		collect(get(root, pre, 0), pre, q);
		return q;
	}

	// 匹配以pre为前缀的键
	private void collect(Node x, String pre, Queue<String> q) {
		if (x == null)
			return;
		if (x.val != null)
			q.enqueue(pre);		// 找到单词，入队
		for (char c = 0; c < R; c++)
			collect(x.next[c], pre + c, q);		// 递归找当前节点之后的分支中存在的单词

	}

	/**
	 * 所有和pat匹配的键，其中“.”表示能够匹配任意字符 比如单词查找树中有“shells”,pre为"shel.."则返回shells
	 */
	public Iterable<String> keysThatMath(String pat) {
		Queue<String> q = new Queue<String>();
		collect(root, "", pat, q);
		if (q.size() == 0)
			throw new RuntimeException("未找到匹配项");
		return q;
	}

	/**
	 * @param pre:已经匹配了的字符串
	 * 将匹配pat模式的字符串加到q
	 */
	private void collect(Node x, String pre, String pat, Queue<String> q) {
		if (x == null)		// 未匹配到
			return;
		int d = pre.length();
		if (d == pat.length() && x.val != null)
			q.enqueue(pre);		// 将匹配pat成功的字符串入队，不ret吗？
		if (d == pat.length())		// 未匹配到
			return;
		char next = pat.charAt(d);
		for (char c = 0; c < R; c++)
			if (next == '.' || next == c)
				collect(x.next[c], pre + c, pat, q);
	}

	/**
	 * 返回s的前缀中最长的键,比如单词查找树中有"she"，则s="shell"，返回"she"
	 */
	public String longestPrefixOf(String s) {
		int length = search(root, s, 0, 0);// 单词查找树中与s匹配的key的最大长度
		return s.substring(0, length);

	}

	// 记录查找与s相关的路径上所找到的最长键的长度
	private int search(Node x, String s, int d, int length) {
		if (x == null)
			return length;		// 返回之前匹配的key的长度
		if (x.val != null)
			length = d;		// 更新长度
		if (d == s.length())		// 单词查找树中包含s
			return length;		// 如果单词树存在包含key的单词，返回key的长度

		char c = s.charAt(d);
		return search(x.next[c], s, d + 1, length);		// 继续查找后面的节点

	}

	public void delete(String key) {
		root = delete(root, key, 0);
	}

	private Node delete(Node x, String key, int d) {
		if (x == null)
			return null;
		if (d == key.length())
			x.val = null;		// 匹配到要删除的键
		else {
			char c = key.charAt(d);
			x.next[c] = delete(x.next[c], key, d + 1);		// 继续查找后面的节点
		}
		if (x.val != null)		// key路径上存在子key，不删除节点
			return x;
		for (char c = 0; c < R; c++)		// 删除key路径上的节点存在向下分支，不删除节点
			if (x.next[c] != null)
				return x;
		return null;		// 当前节点的val为null且不存在向下的分支，删除节点

	}

	public int size() {
		return N;
	}
}

三向单词查找树

单词查找树是将节点维护的node数组下标隐式代表字符，组成树
三向单词查找树，每个节点不维护包含所有字符下标的节点数组，只维护三个节点：left，mid，right
同时包含一个char表示当前节点代表的字符，如果当前字符等于char，访问mid，小于char，访问left，大于char，访问right
包含一个Value val，单词最后一个字符所代表节点的val非空，表示一个单词的完结

三向的效果不需要维护所有字符个数的数组，只维护三个子节点，添加、查找单词都和当前节点的char比较来决定子节点的选择，而不是单词查找树通过当前字符作为索引查找节点数组。

public class TST<Value> {
	private Node root;		// 根结点
	private int N;		//

	private class Node {
		char c;		// 每个结点中保存的字符
		Node left, mid, right;		// 每个节点只有左中右三个子节点
		Value val;
	}

	public void put(String key, Value val) {
		root = put(root, key, val, 0);
	}

	private Node put(Node x, String key, Value val, int d) {
		char c = key.charAt(d);
		if (x == null) {
			x = new Node();
			x.c = c;		// 如果访问到空节点，以当前字符作为中间字符
		}

		if (c < x.c)		// 未命中，根据比较当前节点的结果，key的当前字符不变，递归子节点
			x.left = put(x.left, key, val, d);
		else if (c > x.c)
			x.right = put(x.right, key, val, d);
		else if (d < key.length() - 1)		// 命中，但当前节点不是key最后一个字符
			x.mid = put(x.mid, key, val, d + 1);		// 命中，移到key的下一个字符，递归子节点
		else {		// 当前节点就是key最后一个字符表示的节点
			if (x.val == null)
				N++;
			x.val = val;
		}
		return x;
	}

	// 在三向单词树中查找key
	public Value get(String key) {
		if (key == null)
			throw new NullPointerException();
		if (key.length() == 0)
			throw new IllegalArgumentException("key must have length >= 1");
		Node x = get(root, key, 0);
		if (x == null)
			return null;
		return x.val;
	}

	private Node get(Node x, String key, int d) {
		if (x == null)
			return null;
		char c = key.charAt(d);
		if (c < x.c)		// 未命中，根据比较的结果访问子节点继续匹配当前字符
			return get(x.left, key, d);
		else if (c > x.c)
			return get(x.right, key, d);
		else if (d < key.length() - 1)
			return get(x.mid, key, d + 1);		// 命中，匹配下一个字符
		else
			return x;
	}

	public int size() {
		return N;
	}

	public boolean contains(String key) {
		return get(key) != null;
	}

	// s的子字符串表示的最长单词
	public String longestPrefixOf(String s) {
		if (s == null || s.length() == 0)
			return null;
		int length = 0;
		Node x = root;
		int i = 0;
		while (x != null && i < s.length()) {
			char c = s.charAt(i);
			if (c < x.c)
				x = x.left;
			else if (c > x.c)
				x = x.right;
			else {
				// 命中
				i++;
				if (x.val != null)
					length = i;		// 找到子s且是单词，记录单词长度
				x = x.mid;
			}
		}
		return s.substring(0, length);

	}

	public Iterable<String> keys() {
		Queue<String> queue = new Queue<String>();
		collect(root, new StringBuilder(), queue);
		return queue;
	}

	// 所有以s为前缀的键
	public Iterable<String> keysWithPrefix(String prefix) {
		Queue<String> queue = new Queue<String>();
		Node x = get(root, prefix, 0);
		if (x == null)
			return queue;
		if (x.val != null)
			queue.enqueue(prefix);
		collect(x.mid, new StringBuilder(prefix), queue);
		return queue;
	}

	private void collect(Node x, StringBuilder prefix, Queue<String> queue) {
		if (x == null)
			return;
		collect(x.left, prefix, queue);
		if (x.val != null)
			queue.enqueue(prefix.toString() + x.c);
		collect(x.mid, prefix.append(x.c), queue);
		prefix.deleteCharAt(prefix.length() - 1);// 将记录过的key删除，回溯到最初的首字母
		collect(x.right, prefix, queue);
	}

	// 所有和s匹配的键(其中"."能匹配任意字符)
	public Iterable<String> keysThatMatch(String pattern) {
		Queue<String> queue = new Queue<String>();
		collect(root, new StringBuilder(), 0, pattern, queue);
		return queue;
	}

	private void collect(Node x, StringBuilder prefix, int i, String pattern, Queue<String> queue) {
		if (x == null)
			return;
		char c = pattern.charAt(i);
		if (c == '.' || c < x.c)
			collect(x.left, prefix, i, pattern, queue);
		if (c == '.' || c == x.c) {
			if (i == pattern.length() - 1 && x.val != null)
				queue.enqueue(prefix.toString() + x.c);
			if (i < pattern.length() - 1) {
				collect(x.mid, prefix.append(x.c), i + 1, pattern, queue);
				prefix.deleteCharAt(prefix.length() - 1);
			}
		}
		if (c == '.' || c > x.c)
			collect(x.right, prefix, i, pattern, queue);
	}
}

子字符串查找

KMP

KMP的核心思想是要构造next数组。
next[i]的含义
target的第i个字符及其之前的字符组成的子字符串与从target头部开始匹配的字符个数
next数组的求法
初始化：next[0]=next[1]=0
next数组在字符串查找中，扮演了极其重要的角色。
- 当target[i]和source[j]不匹配时，不需要完全回退到target[0]重新开始比较，通过查找next数组，可以知道source[j-1]处已经匹配了多少个target字符，所以source[j]可以从其后的target字符开始比较，无需回退source，大大减少了重复比较的次数。
不用KMP的暴力比较，其时间复杂度是
O(source.length() * target.length())
KMP的时间复杂度是
O(source.length() + target.length())
此图帮助你理解KMP的思想：

private int[] next;
    public int strStr(String source, String target) {
        // 特殊情况处理
        if(source == null || target == null) {
            return -1;
        }
        if(target.equals("")) {
            return 0;
        }
        
        
        // 预处理target字符串，得到next数组
		kmpPreview(target);
		
		int sourceIndex = 0;
		int targetIndex = 0;
		while(sourceIndex != source.length()) {	// 如果访问完了source还未找到匹配，则不包含target
			//对应字符相等的处理
			if(source.charAt(sourceIndex) == target.charAt(targetIndex)) {
				// target匹配成功
				if(targetIndex == target.length()-1) {
					return sourceIndex - target.length() + 1;
				} else {
					sourceIndex++;
					targetIndex++;
				}
			} else {	// 对应字符不等的处理
				// 如果回退到target头部仍不相等，则放弃当前的进度，开始比较source的下一个字符
				if (targetIndex == 0) {
					sourceIndex++;
				} else {
					// 对应字符不等的处理，由next数组回退target下标，依然和当前的source对应的字符比较，利用了之前比较的进度
					targetIndex = next[targetIndex];
				}
			}
		}
		// 不包含target
		return -1;
    }
    
    /**
	 * @intention:构造next数组。next[i]表示s的第i个字符及其之前的字符组成的子字符串，与从s头开始完全匹配的字符个数
	 */
	private void kmpPreview(String s) {
		next = new int[s.length()+1];
		next[0] = 0;
		next[1] = 0;
		for(int i = 2; i <= s.length(); i++) {
			if(s.charAt(i-1) == s.charAt(next[i-1])) {
				next[i] = next[i-1] + 1;
			} else {
				next[i] = 0;
			}
		}
	}

BM

pat按照从右往左与txt比较
当出现对应位不匹配时，使txt[i]与pat中与其相同的字符对齐
1、如果不存在或者pat与其相同的字符在pat当前比较字符的右边，则将pat往后移动一位
2、如果在左边，移动pat使之与txt[i]对齐
继续从右往左比较
如果比完了所有的pat，则找到匹配
如果pat移出了txt的右边界，则匹配失败

public class BoyerMoore {
	private int right[];
	private String pat;

	public BoyerMoore(String pat) {
		this.pat = pat;
		int R = 256;// 八位ASCII字符表对应的字符数量
		int M = pat.length();
		right = new int[R];

		for (char c = 0; c < R; c++)
			right[c] = -1; // 文本中不包含在模式字符串中的字符的为-1，原因见skip=j-right[txt.charAt(i+j)]的注释
		for (int j = 0; j < M; j++)
			right[pat.charAt(j)] = j;// 包含在模式字符串中的字符的值为它在模式中出现的最右位置,skip=j-right[txt.charAt(i+j)]会出现skip<1的情况
	}

	public int search(String txt) {
		int i, N = txt.length();
		int j, M = pat.length();
		int skip; 
		for (i = 0; i < N - M; i += skip) {
			skip = 0;
			for (j = M - 1; j >= 0; j--) {
				if (txt.charAt(i + j) != pat.charAt(j)) {
					/**
					 * skip为将txt当前字符与pat对齐时，pat需要移动的位数
					 */
					skip = j - right[txt.charAt(i + j)]; 
					/**
					 * txt字符在pat当前字符的左边，只能将pat向后移一位
					 */
					if (skip < 1)
						skip = 1;
					break;
				}

			}

			if (skip == 0)
				return i; // 找到匹配

		}
		return N;// 未找到匹配
	}
}

RK

将pat转化为一个R进制的数，用除留余数法除以一个较大的素数Q取余得到pat的hash值
计算txt的前M（pat的长度）个字符的hash值
1、如果相等，认为匹配成功（当Q极大时，hash相等但不同字符串的概率极低）
2、如果不相等，减去最左边的字符代表的值（注意是R进制）加上下一个字符的值，再求hash值，重复之前的步骤

public class RabinKarp {
	private String pat;// 用于拉斯维加斯算法的check
	private long patHash;// 模式字符串的散列值
	private int M;// 模式字符串的长度
	private long Q;// 一个很大的素数
	private int R = 256;// 8为ASCII表的元素数量
	private long RM;// R^(M-1)%Q

	public RabinKarp(String pat) {
		this.pat = pat;
		this.M = pat.length();
		Q = longRandomPrime();
		RM = 1;
		// 计算R^(M-1)%Q
		for (int i = 1; i <= M - 1; i++)
			RM = (R * RM) % Q;
		patHash = hash(pat, M);
	}

	public int search(String txt) {
		int N = txt.length();
		long txtHash = hash(txt, M);// 获取文本前M个字符串的hash值
		if (txtHash == patHash && check(0)) // 一开始就命中
			return 0;
		// 文本字符开始右移并进行匹配
		for (int i = M; i < N; i++) {
			txtHash = (txtHash + Q - RM * txt.charAt(i - M) % Q) % Q;
			txtHash = (txtHash * R + txt.charAt(i)) % Q;

			/**
			 * 上面两行等价于下面这一行代码，见书中图5.3.17 i>=5的部分
			 */
			// txtHash=((txtHash+txt.charAt(i-M)*(Q-RM))*R+txt.charAt(i))%Q;

			if (patHash == txtHash)
				if (check(i - M + 1))
					return i - M + 1;// 找到匹配

		}
		return N;// 未找到匹配

	}

	// 可蒙特算法
	public boolean check(int i) {
		return true;
	}

	// 拉斯维加斯算法
	public boolean check(String txt, int i) {
		for (int j = 0; j < M; j++) {
			if (txt.charAt(j + i) != pat.charAt(j))
				return false;
		}

		return true;
	}

	// 计算字符串的散列值，散列值为256进制的key值余Q
	private long hash(String key, int M) {
		long h = 0;
		for (int i = 0; i < M; i++)
			h = (R * h + key.charAt(i)) % Q;
		return h;
	}

	// 返回31位的随机素数
	private long longRandomPrime() {
		BigInteger prime = BigInteger.probablePrime(31, new Random());
		return prime.longValue();// 转换为long类型
	}
}