feisty meow concerns codebase  2.140
bookmark_tree.cpp
Go to the documentation of this file.
1 /*****************************************************************************\
2 * *
3 * Name : bookmark_tree *
4 * Author : Chris Koeritz *
5 * *
6 *******************************************************************************
7 * Copyright (c) 2005-$now By Author. This program is free software; you can *
8 * redistribute it and/or modify it under the terms of the GNU General Public *
9 * License as published by the Free Software Foundation; either version 2 of *
10 * the License or (at your option) any later version. This is online at: *
11 * http://www.fsf.org/copyleft/gpl.html *
12 * Please send any updates to: fred@gruntose.com *
13 \*****************************************************************************/
14 
15 #include "bookmark_tree.h"
16 
17 #include <basis/astring.h>
18 #include <basis/functions.h>
19 #include <basis/guards.h>
20 #include <filesystem/byte_filer.h>
21 #include <filesystem/filename.h>
23 #include <loggers/file_logger.h>
25 #include <nodes/symbol_tree.h>
26 #include <structures/amorph.h>
29 #include <textual/list_parsing.h>
30 #include <textual/parser_bits.h>
31 
33 
34 using namespace basis;
35 using namespace filesystem;
36 using namespace loggers;
37 using namespace nodes;
38 using namespace structures;
39 using namespace textual;
40 
41 #define DEBUG_MARKS
42  // uncomment to have more debugging noise, but a reasonable amount.
43 //#define DEBUG_MARKS_TREE
44  // uncomment to get crazy noisy debug noise about tree traversal.
45 
46 #undef BASE_LOG
47 #define BASE_LOG(s) program_wide_logger::get().log(s)
48 #define SHOW_LINE a_sprintf("line %d: ", _line_number)
49 #undef LOG
50 #define LOG(s) CLASS_EMERGENCY_LOG(program_wide_logger::get(), SHOW_LINE + s)
51 #define DEADLY_LINE (astring(func) + a_sprintf(", line %d: ", _line_number))
52 
53 const int ESTIMATED_ELEMENTS = 100;
54  // we're planning for about this many links to be efficiently handled.
55 
56 const int MAX_LINE_SIZE = 256 * KILOBYTE;
57  // the largest line we'll process in the links database.
58 
59 // used to compare two strings while ignoring case; we use this to find
60 // our categories in the symbol table.
61 bool case_insense_compare(const astring &a, const astring &b)
62 { return a.iequals(b); }
63 
65 
66 listo_links::listo_links() : amorph<link_record>(), _next_index(0) {}
67 
68 void listo_links::add(link_record *new_rec, bool sort)
69 {
70  // we don't sort blank lines--they just get dropped in the end of
71  // the section.
72  if (sort && new_rec->_description.t()) {
73  for (int i = _next_index; i < elements(); i++) {
74  const astring &desc_cur = borrow(i)->_description;
75 //this check doesn't make much sense; it only checks if the description is equal?
76 // if it were really sorting, wouldn't it need to check if the check is greater than current?
77  if (desc_cur.iequals(new_rec->_description)
78 // || shouldn't there be a case for this being greater than the current???
79  || !desc_cur) {
80  insert(i + 1, 1);
81  put(i + 1, new_rec);
82  return;
83  }
84  }
85  }
86  append(new_rec);
87  if (!sort)
88  _next_index = elements();
89 }
90 
92 
93 class symbol_int : public symbol_table<int>
94 {
95 public:
96  symbol_int() : symbol_table<int>(10) {}
97 };
98 
100 
102 : _line_number(0),
103  _mark_tree(new inner_mark_tree("Root", 0, ESTIMATED_ELEMENTS)),
104  _link_count(0),
105  _category_count(0),
106  _last_parent(_mark_tree),
107  _last_node(_mark_tree),
108  _links_seen(new symbol_int),
109  _category_names(new string_table)
110 {}
111 
113 {
114  WHACK(_links_seen);
115  WHACK(_mark_tree);
116  WHACK(_category_names);
117 }
118 
119 void bookmark_tree::break_name(const astring &to_break, astring &name,
120  astring &nick)
121 {
122  nick = astring::empty_string();
123  name = to_break;
124  int indy = name.find('[');
125  if (negative(indy)) return;
126  nick = name.substring(indy + 1, name.end());
127  while ( (nick[nick.end()] == ' ') || (nick[nick.end()] == ']') )
128  nick.zap(nick.end(), nick.end());
129  name.zap(indy, name.end());
130  name.strip_spaces();
131  nick.strip_spaces();
132 }
133 
134 inner_mark_tree &bookmark_tree::access_root() { return *_mark_tree; }
135 
137 {
138  FUNCDEF("magic_category_comparison");
139 //LOG(astring("compare: a=") + a + " b=" + b);
140  if (a.iequals(b)) return true;
141  astring a_name, a_nick;
142  break_name(a, a_name, a_nick);
143  astring b_name, b_nick;
144  break_name(b, b_name, b_nick);
145  if (a_name.iequals(b_name)) return true;
146  if (a_nick.t() && a_nick.iequals(b_name)) return true;
147  if (b_nick.t() && a_name.iequals(b_nick)) return true;
148  if (a_nick.t() && b_nick.t() && a_nick.iequals(b_nick)) return true;
149  return false;
150 }
151 
152 const astring &HTTP_HEAD = "http://";
153 const astring &FTP_HEAD = "ftp://";
154 const astring &WWW_SITE = "www.";
155 const astring &FTP_SITE = "ftp.";
156 
157 bool bookmark_tree::advance(int &index, const astring &check, const astring &finding)
158 {
159  if (check.compare(finding, index, 0, finding.length(), false)) {
160  index += finding.length();
161  return true;
162  }
163  return false;
164 }
165 
167 {
168  int to_return = 0;
169  advance(to_return, to_prune, HTTP_HEAD);
170  advance(to_return, to_prune, FTP_HEAD);
171  advance(to_return, to_prune, WWW_SITE);
172  advance(to_return, to_prune, FTP_SITE);
173  return to_return;
174 }
175 
177 {
178 //printf("%s\n", (astring("pruned=") + to_prune.substring(find_prune_point(to_prune), to_prune.end())).s());
179  return to_prune.substring(find_prune_point(to_prune), to_prune.end()); }
180 
182 {
183  int prune_a = find_prune_point(a);
184  int prune_b = find_prune_point(b);
185  int bigger_len = maximum(a.length() - prune_a, b.length() - prune_b);
186  bool to_return = a.compare(b, prune_a, prune_b, bigger_len, false);
187 //if (to_return) printf("%s and %s are equal.", a.s(), b.s());
188  return to_return;
189 }
190 
192 {
193  FUNCDEF("find_parent");
194  // first, look for the node above the last parent.
195  inner_mark_tree *parent = dynamic_cast<inner_mark_tree *>
196  (_last_parent->find(parent_name, inner_mark_tree::recurse_upward,
198 
199 #ifdef DEBUG_MARKS_TREE
200  if (parent)
201  LOG(astring("trying upwards find for parent node ") + parent_name);
202 #endif
203 
204  if (!parent) {
205 #ifdef DEBUG_MARKS_TREE
206  LOG(astring("upwards find failed seeking on last_parent node ")
207  + parent_name);
208 #endif
209 
210  // we didn't find the parent above the last category...
211  parent = dynamic_cast<inner_mark_tree *>(_last_node->find(parent_name,
213  }
214 
215  if (!parent) {
216 #ifdef DEBUG_MARKS_TREE
217  LOG(astring("upwards find failed seeking on last_node ") + parent_name);
218 #endif
219 
220  // last node didn't help either.
221  parent = dynamic_cast<inner_mark_tree *>(_mark_tree->find(parent_name,
223  }
224  if (!parent) {
225  // failed to find the parent node, so hook it to the root node.
226  LOG(astring("failed to find parent node ") + parent_name);
227 
228  // create a parent node and use it for this guy.
229  inner_mark_tree *new_node = new inner_mark_tree(parent_name,
231  _mark_tree->attach(new_node);
232  _mark_tree->sort();
233  _category_count++;
234 
235  parent = new_node;
236  } else {
237 #ifdef DEBUG_MARKS_TREE
238  LOG(astring("found parent node ") + parent_name);
239 #endif
240  }
241 
242  return parent;
243 }
244 
246 {
247  FUNCDEF("process_category");
248  const astring &category_name = items[1];
249  const astring &parent_name = items[2];
250 
251  if (items.length() > 3) {
252  // complain about a possibly malformed category.
253  LOG(astring("category ") + category_name + " under " + parent_name
254  + " has extra fields!");
255  }
256 
257 //BASE_LOG("CURRENT:");
258 //BASE_LOG(_mark_tree->text_form());
259 
260  // make sure we don't add anything to the tree if this is the root.
261  if (!parent_name || magic_category_comparison("Root", category_name)) {
262 #ifdef DEBUG_MARKS_TREE
263  LOG(astring("skipping parent node for ") + category_name);
264 #endif
265  return _mark_tree;
266  }
267 
268  // ensure that the categories aren't competing with other names.
269  astring main_name, nickname;
270  break_name(category_name, main_name, nickname);
271  astring *found1 = _category_names->find(main_name, case_insense_compare);
272  astring *found2 = _category_names->find(nickname, case_insense_compare);
273  if (found1 || found2) {
274  astring catnames;
275  if (found1) catnames = *found1; // add the first match, if it exists.
276  if (found2) {
277  if (!!catnames) catnames += " and ";
278  catnames += *found2;
279  }
280  LOG(astring("category name \"") + category_name
281  + "\" in conflict with existing: " + catnames);
282  inner_mark_tree *fake_it = NULL_POINTER;
283 
284 //hmmm: neither of these are right; they need to use a comparator that
285 // uses our magic comparison function.
286 
287  if (found1) {
288 #ifdef DEBUG_MARKS
289  LOG(astring("found existing category for main name: ") + main_name);
290 #endif
291 // fake_it = (inner_mark_tree *)_mark_tree->find(*found1,
292 // symbol_tree::recurse_downward);
293  fake_it = dynamic_cast<inner_mark_tree *>(_mark_tree->find
296  }
297  if (fake_it) {
298 #ifdef DEBUG_MARKS
299  LOG(astring("returning existing category for main name: ") + main_name
300  + " as: " + fake_it->name());
301 #endif
302  return fake_it;
303  }
304  if (found2) {
305 #ifdef DEBUG_MARKS
306  LOG(astring("found existing category for nickname: ") + nickname);
307 #endif
310  fake_it = dynamic_cast<inner_mark_tree *>(_mark_tree->find
313  }
314  if (fake_it) {
315 #ifdef DEBUG_MARKS
316  LOG(astring("returning existing category for nickname: ") + nickname
317  + " as: " + fake_it->name());
318 #endif
319  return fake_it;
320  }
321  LOG("==> failure to find a match for either category!");
322  deadly_error(class_name(), func, "collision resolution code failed; "
323  "please fix category error");
324  return NULL_POINTER;
325  }
326  // now that we know these names are unique, we'll add them into the list
327  // so future categories can't reuse these.
328  _category_names->add(main_name, main_name);
329  if (!!nickname) _category_names->add(nickname, nickname);
330 
331  inner_mark_tree *parent = find_parent(parent_name);
332  _last_parent = parent; // set the parent for the next time.
333 
334  // see if the category is already present under the parent.
335  for (int i = 0; i < parent->branches(); i++) {
336  inner_mark_tree *curr = dynamic_cast<inner_mark_tree *>(parent->branch(i));
337  if (!curr)
338  non_continuable_error(class_name(), DEADLY_LINE, "missing branch in tree");
339 #ifdef DEBUG_MARKS_TREE
340  LOG(astring("looking at branch ") + curr->name());
341 #endif
342  if (magic_category_comparison(curr->name(), category_name)) {
343  // it already exists? argh.
344  LOG(astring("category ") + category_name + " already exists under "
345  + parent_name + ".");
346  _last_node = curr;
347  return curr;
348  }
349  }
350 
351  inner_mark_tree *new_node = new inner_mark_tree(category_name,
353  parent->attach(new_node);
354  parent->sort();
355  _last_node = new_node;
356 
357  _category_count++;
358 
359 #ifdef DEBUG_MARKS_TREE
360  LOG(astring("attaching node ") + category_name + " to parent "
361  + parent->name());
362 #endif
363  return new_node;
364 }
365 
367 {
368  FUNCDEF("process_link");
369  astring description = items[1];
370  astring parent_name = items[2];
371  astring url = "UNKNOWN";
372  if (items.length() >= 4) url = items[3];
373 
374  // strip any directory slashes that are provided as a suffix. we don't need
375  // them and they tend to confuse the issue when we look for duplicates.
376  while (url[url.end()] == '/') {
377  url.zap(url.end(), url.end());
378  }
379 
380  // make some noise if they seem to have a badly formed link.
381  if (items.length() < 4) {
382  LOG(astring("link ") + description + " under " + parent_name
383  + " has no URL!");
384  } else if (items.length() > 4) {
385  LOG(astring("link ") + description + " under " + parent_name
386  + " has extra fields!");
387  }
388 
389  // find the parent for this link.
390  inner_mark_tree *parent = find_parent(parent_name);
391  _last_parent = parent; // set the parent for the next time.
392 
393  // see if the link already exists.
394  int *found = _links_seen->find(url, excellent_link_comparator);
395  if (found) {
396  // this is not so great; a duplicate link has been found.
397  LOG(a_sprintf("Duplicate Link: line %d already has ", *found) + url);
398  return;
399  } else {
400  _links_seen->add(prune_link_down(url), _line_number);
401  }
402 
403  // add the link to the parent.
404  link_record *new_rec = new link_record(description, url, _line_number);
405  parent->_links.add(new_rec);
406 
407  _link_count++;
408 }
409 
410 void bookmark_tree::process_comment(const astring &current_line_in)
411 {
412  FUNCDEF("process_comment");
413  astring current_line = current_line_in;
414 
415  // output the comment as simple text.
416 //BASE_LOG("comment case");
417  if (current_line.contains("http:")) {
418  astring hold_curr = current_line;
419  int indy = current_line.find("http:");
420  hold_curr.zap(0, indy - 1);
421  current_line = astring("&nbsp; &nbsp; &nbsp; &nbsp; "
422  "<a href=\"") + hold_curr + "\">" + hold_curr + "</a>";
423  } else if (current_line.t()) {
424  // snap the comment character off of the front.
425  current_line.zap(0, 0);
426  }
427 
428  link_record *new_rec = new link_record(current_line,
429  astring::empty_string(), _line_number);
430  _last_node->_links.add(new_rec, false);
431 }
432 
433 int bookmark_tree::read_csv_file(const astring &input_filename)
434 {
435  FUNCDEF("read_csv_file");
436  byte_filer input_file(input_filename, "r");
437  if (!input_file.good())
438  non_continuable_error(class_name(), DEADLY_LINE,
439  "the input file could not be opened");
440 
441  string_array items; // parsed in csv line.
442  astring current_line; // read from input file.
443 
444  // read the lines in the file, one at a time.
445  while (input_file.getline(current_line, MAX_LINE_SIZE) > 0) {
446  _line_number++; // go to the next line.
447  // remove the carriage returns first.
448  while (parser_bits::is_eol(current_line[current_line.end()])) {
449  current_line.zap(current_line.end(), current_line.end());
450  }
451  current_line.strip_spaces();
452  if (!current_line.length()) {
453 // // blank lines get treated as a special case. they are always added
454 // // at the end of the list.
455 // process_comment(current_line);
456  continue;
457  } else if (current_line[0] == '#') {
458  // handle a comment in the database.
459  process_comment(current_line);
460  } else {
461  // csv parse the line, since we don't support much else.
462  bool parsed = list_parsing::parse_csv_line(current_line, items);
463  if (!parsed)
464  non_continuable_error(class_name(), DEADLY_LINE,
465  astring("the line could not be parsed: ") + current_line);
466  if (!items.length()) {
467  LOG("bad formatting on this line.");
468  continue;
469  }
470  if (items[0].iequals("C")) {
472  if (!node) {
473  LOG(astring("failed to get a node for ") + items[1]);
474  }
475  } else if (items[0].iequals("L")) {
476  process_link(items);
477  } else {
478  non_continuable_error(class_name(), DEADLY_LINE,
479  astring("unknown format in line: ") + current_line);
480  }
481  }
482  }
483  return 0;
484 }
485 
const astring & WWW_SITE
#define DEADLY_LINE
#define LOG(s)
bool case_insense_compare(const astring &a, const astring &b)
const astring & HTTP_HEAD
const astring & FTP_HEAD
const astring & FTP_SITE
const int MAX_LINE_SIZE
const int ESTIMATED_ELEMENTS
a_sprintf is a specialization of astring that provides printf style support.
Definition: astring.h:440
int length() const
Returns the current reported length of the allocated C array.
Definition: array.h:115
Provides a dynamically resizable ASCII character string.
Definition: astring.h:35
bool t() const
t() is a shortcut for the string being "true", as in non-empty.
Definition: astring.h:97
virtual void zap(int start, int end)
Deletes the characters between "start" and "end" inclusively.
Definition: astring.cpp:521
bool substring(astring &target, int start, int end) const
a version that stores the substring in an existing "target" string.
Definition: astring.cpp:865
void strip_spaces(how_to_strip way=FROM_BOTH_SIDES)
removes excess space characters from string's beginning, end or both.
Definition: astring.h:325
bool iequals(const astring &that) const
returns true if this is case-insensitively equal to "that".
Definition: astring.cpp:565
int end() const
returns the index of the last (non-null) character in the string.
Definition: astring.h:86
int length() const
Returns the current length of the string.
Definition: astring.cpp:132
bool compare(const astring &to_compare, int start_first, int start_second, int count, bool case_sensitive) const
Compares "this" string with "to_compare".
Definition: astring.cpp:810
int find(char to_find, int position=0, bool reverse=false) const
Locates "to_find" in "this".
Definition: astring.cpp:574
bool contains(const astring &to_find) const
Returns true if "to_find" is contained in this string or false if not.
Definition: astring.cpp:162
static int find_prune_point(const basis::astring &to_prune)
attempts to locate the real start of the root URL in "to_prune".
inner_mark_tree * process_category(const structures::string_array &items)
inner_mark_tree * find_parent(const basis::astring &parent_name)
void process_comment(const basis::astring &current_line_in)
int read_csv_file(const basis::astring &input_filename)
void process_link(const structures::string_array &items)
static bool advance(int &index, const basis::astring &check, const basis::astring &finding)
moves the "index" forward if the "finding" string is the head of "check".
virtual ~bookmark_tree()
static basis::astring prune_link_down(const basis::astring &to_prune)
static bool magic_category_comparison(const basis::astring &a, const basis::astring &b)
static bool excellent_link_comparator(const basis::astring &a, const basis::astring &b)
static void break_name(const basis::astring &to_break, basis::astring &name, basis::astring &nick)
inner_mark_tree & access_root()
Provides file managment services using the standard I/O support.
Definition: byte_filer.h:32
int getline(basis::abyte *buffer, int desired_size)
reads a line of text (terminated by a return) into the "buffer".
Definition: byte_filer.cpp:201
bool good()
returns true if the file seems to be in the appropriate desired state.
Definition: byte_filer.cpp:103
listo_links _links
An object representing the interstitial cell in most linked data structures.
Definition: node.h:41
void sort()
sorts the sub-nodes of this symbol_tree.
symbol_tree * find(const basis::astring &to_find, find_methods how, basis::string_comparator_function *comp=NULL_POINTER)
returns the node specified by "to_find" or NULL_POINTER.
const basis::astring & name() const
returns the name of this node.
Definition: symbol_tree.cpp:78
symbol_tree * branch(int index) const
returns the "index"th branch.
virtual int branches() const
Returns the number of branches currently connected to this tree.
Definition: tree.cpp:431
virtual void attach(tree *new_branch)
Attaches the specified branch to the current tree.
Definition: tree.cpp:453
int elements() const
the maximum number of elements currently allowed in this amorph.
Definition: amorph.h:66
basis::outcome put(int field, const link_record *data)
Enters an object into the field at index "field" in the amorph.
Definition: amorph.h:370
basis::outcome append(const link_record *data)
puts "data" on the end of this amorph.
Definition: amorph.h:303
basis::outcome insert(int position, int lines_to_add)
Adds "lines_to_add" indices to the amorph at the index "position".
Definition: amorph.h:345
link_record * borrow(int field)
Returns a pointer to the information at the index "field".
Definition: amorph.h:448
An array of strings with some additional helpful methods.
Definition: string_array.h:32
Provides a symbol_table that holds strings as the content.
Definition: string_table.h:32
Maintains a list of names, where each name has a type and some contents.
Definition: symbol_table.h:36
contents * find(const basis::astring &name) const
returns the contents held for "name" or NULL_POINTER if it wasn't found.
Definition: symbol_table.h:313
basis::outcome add(const basis::astring &name, const contents &storage)
Enters a symbol name into the table along with some contents.
Definition: symbol_table.h:383
#define non_continuable_error(c, f, i)
an extra piece of information used, if available, in bounds_halt below.
#define deadly_error(c, f, i)
#define NULL_POINTER
The value representing a pointer to nothing.
Definition: definitions.h:32
#define FUNCDEF(func_in)
FUNCDEF sets the name of a function (and plugs it into the callstack).
Definition: enhance_cpp.h:57
The guards collection helps in testing preconditions and reporting errors.
Definition: array.h:30
void WHACK(contents *&ptr)
deletion with clearing of the pointer.
Definition: functions.h:121
type maximum(type a, type b)
minimum returns the lesser of two values.
Definition: functions.h:26
bool negative(const type &a)
negative returns true if "a" is less than zero.
Definition: functions.h:43
const int KILOBYTE
Number of bytes in a kilobyte.
Definition: definitions.h:134
A platform independent way to obtain the timestamp of a file.
Definition: byte_filer.cpp:37
A logger that sends to the console screen using the standard output device.
A dynamic container class that holds any kind of object via pointers.
Definition: amorph.h:55
bool is_eol(char to_check)