tasty solution to word break finder
authorChris Koeritz <fred@gruntose.com>
Sun, 1 Jan 2017 22:52:55 +0000 (17:52 -0500)
committerChris Koeritz <fred@gruntose.com>
Sun, 1 Jan 2017 22:52:55 +0000 (17:52 -0500)
wrote up the solution on the whiteboard, trying to remember to speak my way through it.  then turned that solution into real code, to check where i was hosing up.  need to remember static void main(String args[]) rather than C variants.  got most function names pretty right for simple stuff.  code was actually correct modulo a couple infinitely looping bugs.  oof.

kona/src/org/feistymeow/textual/SimpleDictionary.java [new file with mode: 0644]
kona/src/org/feistymeow/textual/WordBreakFinder.java [new file with mode: 0644]
scripts/cleaning_progress.txt [new file with mode: 0644]

diff --git a/kona/src/org/feistymeow/textual/SimpleDictionary.java b/kona/src/org/feistymeow/textual/SimpleDictionary.java
new file mode 100644 (file)
index 0000000..6d71f9b
--- /dev/null
@@ -0,0 +1,42 @@
+package org.feistymeow.textual;
+
+import java.util.HashSet;
+import java.util.Set;
+
+public class SimpleDictionary extends HashSet<String>
+//or alternatively, BinaryTree<String>
+//=> what is BST implem for java!  is it balanced?
+{
+  public SimpleDictionary (Set<String> words) {
+    addAll(words);
+    computeLongestWord();
+  }
+  
+  public SimpleDictionary (String words[]) {
+       for (String word : words) {
+               add(word);
+       }
+    computeLongestWord();
+  }
+
+  public int computeLongestWord() {
+    previouslyComputedLongestWord = 1;
+    
+    //hmmm: iterate on set to find longest.
+    
+//kludge implem placeholder.
+previouslyComputedLongestWord = 100;
+       return previouslyComputedLongestWord;
+  }
+  
+  public boolean lookup(String toFind) {
+    return contains(toFind);
+  }
+
+  public int longestWord() {
+    return previouslyComputedLongestWord;
+  }
+
+  int previouslyComputedLongestWord;
+}
+
diff --git a/kona/src/org/feistymeow/textual/WordBreakFinder.java b/kona/src/org/feistymeow/textual/WordBreakFinder.java
new file mode 100644 (file)
index 0000000..17388a8
--- /dev/null
@@ -0,0 +1,73 @@
+package org.feistymeow.textual;
+
+import java.util.HashSet;
+import java.util.Set;
+
+public class WordBreakFinder
+{
+  public WordBreakFinder(SimpleDictionary lookups) {
+    _lookups = lookups;
+  }
+
+  public Set<String> findStringsWithGoodSpacing(String orig) {
+    HashSet<String> toReturn = new HashSet<String>();
+    int startingChunk = Math.min(orig.length(), _lookups.longestWord());
+    for (int i = startingChunk; i >= 1; i--) {
+      String first = orig.substring(0, i);
+      if (!_lookups.lookup(first)) {
+        // fail fast.  this length chunk of the word doesn't match.
+        continue;
+      }
+      String second = orig.substring(i, orig.length());
+      // first part of string is listed in dictionary.  we can trivially add it if the other part is empty.
+      if (second.length() == 0) {
+         toReturn.add(first);
+         continue;
+      }
+      Set<String> matches = findStringsWithGoodSpacing(second);
+      if (matches != null) {
+        for (String matched : matches) {
+           toReturn.add(first + " " + matched);
+        }
+      }
+    }
+    if (toReturn.isEmpty()) return null;
+    return toReturn;
+  }
+
+  SimpleDictionary _lookups;  
+
+  public static void main(String argv[]) {
+    String smallWordList[] = {
+       "hops", "barley", "malt", "beer", "turnip"
+    };
+    SimpleDictionary lookups = new SimpleDictionary(smallWordList);
+    WordBreakFinder finder = new WordBreakFinder(lookups);
+    
+    String toBreak = "maltbeerbeerturniphops";
+    Set<String> matches =  finder.findStringsWithGoodSpacing(toBreak);
+    if (matches != null) {
+      System.out.println("matches found:");
+      for (String match : matches) {
+         System.out.println(match);
+      }
+    } else {
+      System.out.println("ERROR: failed to find matches!");
+    }
+
+    toBreak = "maltbeerbeturniphops";
+    matches =  finder.findStringsWithGoodSpacing(toBreak);
+    if (matches != null) {
+      System.out.println("ERROR: matches found:");
+      for (String match : matches) {
+         System.out.println(match);
+      }
+    } else {
+      System.out.println("found no matches, which is correct.");
+    }
+  }
+
+}
+
+
+
diff --git a/scripts/cleaning_progress.txt b/scripts/cleaning_progress.txt
new file mode 100644 (file)
index 0000000..2759cee
--- /dev/null
@@ -0,0 +1,4 @@
+
+
+starting at:
+archival 2017-01-01