From: Chris Koeritz Date: Sun, 1 Jan 2017 22:52:55 +0000 (-0500) Subject: tasty solution to word break finder X-Git-Tag: 2.140.90~298 X-Git-Url: https://feistymeow.org/gitweb/?p=feisty_meow.git;a=commitdiff_plain;h=e6b0505dafd7d10d7c36757493e9cd05cbfda99d tasty solution to word break finder wrote up the solution on the whiteboard, trying to remember to speak my way through it. then turned that solution into real code, to check where i was hosing up. need to remember static void main(String args[]) rather than C variants. got most function names pretty right for simple stuff. code was actually correct modulo a couple infinitely looping bugs. oof. --- diff --git a/kona/src/org/feistymeow/textual/SimpleDictionary.java b/kona/src/org/feistymeow/textual/SimpleDictionary.java new file mode 100644 index 00000000..6d71f9b8 --- /dev/null +++ b/kona/src/org/feistymeow/textual/SimpleDictionary.java @@ -0,0 +1,42 @@ +package org.feistymeow.textual; + +import java.util.HashSet; +import java.util.Set; + +public class SimpleDictionary extends HashSet +//or alternatively, BinaryTree +//=> what is BST implem for java! is it balanced? +{ + public SimpleDictionary (Set words) { + addAll(words); + computeLongestWord(); + } + + public SimpleDictionary (String words[]) { + for (String word : words) { + add(word); + } + computeLongestWord(); + } + + public int computeLongestWord() { + previouslyComputedLongestWord = 1; + + //hmmm: iterate on set to find longest. + +//kludge implem placeholder. +previouslyComputedLongestWord = 100; + return previouslyComputedLongestWord; + } + + public boolean lookup(String toFind) { + return contains(toFind); + } + + public int longestWord() { + return previouslyComputedLongestWord; + } + + int previouslyComputedLongestWord; +} + diff --git a/kona/src/org/feistymeow/textual/WordBreakFinder.java b/kona/src/org/feistymeow/textual/WordBreakFinder.java new file mode 100644 index 00000000..17388a87 --- /dev/null +++ b/kona/src/org/feistymeow/textual/WordBreakFinder.java @@ -0,0 +1,73 @@ +package org.feistymeow.textual; + +import java.util.HashSet; +import java.util.Set; + +public class WordBreakFinder +{ + public WordBreakFinder(SimpleDictionary lookups) { + _lookups = lookups; + } + + public Set findStringsWithGoodSpacing(String orig) { + HashSet toReturn = new HashSet(); + int startingChunk = Math.min(orig.length(), _lookups.longestWord()); + for (int i = startingChunk; i >= 1; i--) { + String first = orig.substring(0, i); + if (!_lookups.lookup(first)) { + // fail fast. this length chunk of the word doesn't match. + continue; + } + String second = orig.substring(i, orig.length()); + // first part of string is listed in dictionary. we can trivially add it if the other part is empty. + if (second.length() == 0) { + toReturn.add(first); + continue; + } + Set matches = findStringsWithGoodSpacing(second); + if (matches != null) { + for (String matched : matches) { + toReturn.add(first + " " + matched); + } + } + } + if (toReturn.isEmpty()) return null; + return toReturn; + } + + SimpleDictionary _lookups; + + public static void main(String argv[]) { + String smallWordList[] = { + "hops", "barley", "malt", "beer", "turnip" + }; + SimpleDictionary lookups = new SimpleDictionary(smallWordList); + WordBreakFinder finder = new WordBreakFinder(lookups); + + String toBreak = "maltbeerbeerturniphops"; + Set matches = finder.findStringsWithGoodSpacing(toBreak); + if (matches != null) { + System.out.println("matches found:"); + for (String match : matches) { + System.out.println(match); + } + } else { + System.out.println("ERROR: failed to find matches!"); + } + + toBreak = "maltbeerbeturniphops"; + matches = finder.findStringsWithGoodSpacing(toBreak); + if (matches != null) { + System.out.println("ERROR: matches found:"); + for (String match : matches) { + System.out.println(match); + } + } else { + System.out.println("found no matches, which is correct."); + } + } + +} + + + diff --git a/scripts/cleaning_progress.txt b/scripts/cleaning_progress.txt new file mode 100644 index 00000000..2759cee6 --- /dev/null +++ b/scripts/cleaning_progress.txt @@ -0,0 +1,4 @@ + + +starting at: +archival 2017-01-01