feisty meow concerns codebase 2.140
bookmark_tree.cpp
Go to the documentation of this file.
1/*****************************************************************************\
2* *
3* Name : bookmark_tree *
4* Author : Chris Koeritz *
5* *
6*******************************************************************************
7* Copyright (c) 2005-$now By Author. This program is free software; you can *
8* redistribute it and/or modify it under the terms of the GNU General Public *
9* License as published by the Free Software Foundation; either version 2 of *
10* the License or (at your option) any later version. This is online at: *
11* http://www.fsf.org/copyleft/gpl.html *
12* Please send any updates to: fred@gruntose.com *
13\*****************************************************************************/
14
15#include "bookmark_tree.h"
16
17#include <basis/astring.h>
18#include <basis/functions.h>
19#include <basis/guards.h>
21#include <filesystem/filename.h>
23#include <loggers/file_logger.h>
25#include <nodes/symbol_tree.h>
26#include <structures/amorph.h>
30#include <textual/parser_bits.h>
31
33
34using namespace basis;
35using namespace filesystem;
36using namespace loggers;
37using namespace nodes;
38using namespace structures;
39using namespace textual;
40
41#define DEBUG_MARKS
42 // uncomment to have more debugging noise, but a reasonable amount.
43//#define DEBUG_MARKS_TREE
44 // uncomment to get crazy noisy debug noise about tree traversal.
45
46#undef BASE_LOG
47#define BASE_LOG(s) program_wide_logger::get().log(s)
48#define SHOW_LINE a_sprintf("line %d: ", _line_number)
49#undef LOG
50#define LOG(s) CLASS_EMERGENCY_LOG(program_wide_logger::get(), SHOW_LINE + s)
51#define DEADLY_LINE (astring(func) + a_sprintf(", line %d: ", _line_number))
52
53const int ESTIMATED_ELEMENTS = 100;
54 // we're planning for about this many links to be efficiently handled.
55
56const int MAX_LINE_SIZE = 256 * KILOBYTE;
57 // the largest line we'll process in the links database.
58
59// used to compare two strings while ignoring case; we use this to find
60// our categories in the symbol table.
61bool case_insense_compare(const astring &a, const astring &b)
62{ return a.iequals(b); }
63
65
67
68void listo_links::add(link_record *new_rec, bool sort)
69{
70 // we don't sort blank lines--they just get dropped in the end of
71 // the section.
72 if (sort && new_rec->_description.t()) {
73 for (int i = _next_index; i < elements(); i++) {
74 const astring &desc_cur = borrow(i)->_description;
75//this check doesn't make much sense; it only checks if the description is equal?
76// if it were really sorting, wouldn't it need to check if the check is greater than current?
77 if (desc_cur.iequals(new_rec->_description)
78// || shouldn't there be a case for this being greater than the current???
79 || !desc_cur) {
80 insert(i + 1, 1);
81 put(i + 1, new_rec);
82 return;
83 }
84 }
85 }
86 append(new_rec);
87 if (!sort)
88 _next_index = elements();
89}
90
92
93class symbol_int : public symbol_table<int>
94{
95public:
96 symbol_int() : symbol_table<int>(10) {}
97};
98
100
102: _line_number(0),
103 _mark_tree(new inner_mark_tree("Root", 0, ESTIMATED_ELEMENTS)),
104 _link_count(0),
105 _category_count(0),
106 _last_parent(_mark_tree),
107 _last_node(_mark_tree),
108 _links_seen(new symbol_int),
109 _category_names(new string_table)
110{}
111
113{
114 WHACK(_links_seen);
115 WHACK(_mark_tree);
116 WHACK(_category_names);
117}
118
119void bookmark_tree::break_name(const astring &to_break, astring &name,
120 astring &nick)
121{
122 nick = astring::empty_string();
123 name = to_break;
124 int indy = name.find('[');
125 if (negative(indy)) return;
126 nick = name.substring(indy + 1, name.end());
127 while ( (nick[nick.end()] == ' ') || (nick[nick.end()] == ']') )
128 nick.zap(nick.end(), nick.end());
129 name.zap(indy, name.end());
130 name.strip_spaces();
131 nick.strip_spaces();
132}
133
135
137{
138 FUNCDEF("magic_category_comparison");
139//LOG(astring("compare: a=") + a + " b=" + b);
140 if (a.iequals(b)) return true;
141 astring a_name, a_nick;
142 break_name(a, a_name, a_nick);
143 astring b_name, b_nick;
144 break_name(b, b_name, b_nick);
145 if (a_name.iequals(b_name)) return true;
146 if (a_nick.t() && a_nick.iequals(b_name)) return true;
147 if (b_nick.t() && a_name.iequals(b_nick)) return true;
148 if (a_nick.t() && b_nick.t() && a_nick.iequals(b_nick)) return true;
149 return false;
150}
151
152const astring &HTTP_HEAD = "http://";
153const astring &FTP_HEAD = "ftp://";
154const astring &WWW_SITE = "www.";
155const astring &FTP_SITE = "ftp.";
156
157bool bookmark_tree::advance(int &index, const astring &check, const astring &finding)
158{
159 if (check.compare(finding, index, 0, finding.length(), false)) {
160 index += finding.length();
161 return true;
162 }
163 return false;
164}
165
167{
168 int to_return = 0;
169 advance(to_return, to_prune, HTTP_HEAD);
170 advance(to_return, to_prune, FTP_HEAD);
171 advance(to_return, to_prune, WWW_SITE);
172 advance(to_return, to_prune, FTP_SITE);
173 return to_return;
174}
175
177{
178//printf("%s\n", (astring("pruned=") + to_prune.substring(find_prune_point(to_prune), to_prune.end())).s());
179 return to_prune.substring(find_prune_point(to_prune), to_prune.end()); }
180
182{
183 int prune_a = find_prune_point(a);
184 int prune_b = find_prune_point(b);
185 int bigger_len = maximum(a.length() - prune_a, b.length() - prune_b);
186 bool to_return = a.compare(b, prune_a, prune_b, bigger_len, false);
187//if (to_return) printf("%s and %s are equal.", a.s(), b.s());
188 return to_return;
189}
190
192{
193 FUNCDEF("find_parent");
194 // first, look for the node above the last parent.
195 inner_mark_tree *parent = dynamic_cast<inner_mark_tree *>
196 (_last_parent->find(parent_name, inner_mark_tree::recurse_upward,
198
199#ifdef DEBUG_MARKS_TREE
200 if (parent)
201 LOG(astring("trying upwards find for parent node ") + parent_name);
202#endif
203
204 if (!parent) {
205#ifdef DEBUG_MARKS_TREE
206 LOG(astring("upwards find failed seeking on last_parent node ")
207 + parent_name);
208#endif
209
210 // we didn't find the parent above the last category...
211 parent = dynamic_cast<inner_mark_tree *>(_last_node->find(parent_name,
213 }
214
215 if (!parent) {
216#ifdef DEBUG_MARKS_TREE
217 LOG(astring("upwards find failed seeking on last_node ") + parent_name);
218#endif
219
220 // last node didn't help either.
221 parent = dynamic_cast<inner_mark_tree *>(_mark_tree->find(parent_name,
223 }
224 if (!parent) {
225 // failed to find the parent node, so hook it to the root node.
226 LOG(astring("failed to find parent node ") + parent_name);
227
228 // create a parent node and use it for this guy.
229 inner_mark_tree *new_node = new inner_mark_tree(parent_name,
231 _mark_tree->attach(new_node);
232 _mark_tree->sort();
233 _category_count++;
234
235 parent = new_node;
236 } else {
237#ifdef DEBUG_MARKS_TREE
238 LOG(astring("found parent node ") + parent_name);
239#endif
240 }
241
242 return parent;
243}
244
246{
247 FUNCDEF("process_category");
248 const astring &category_name = items[1];
249 const astring &parent_name = items[2];
250
251 if (items.length() > 3) {
252 // complain about a possibly malformed category.
253 LOG(astring("category ") + category_name + " under " + parent_name
254 + " has extra fields!");
255 }
256
257//BASE_LOG("CURRENT:");
258//BASE_LOG(_mark_tree->text_form());
259
260 // make sure we don't add anything to the tree if this is the root.
261 if (!parent_name || magic_category_comparison("Root", category_name)) {
262#ifdef DEBUG_MARKS_TREE
263 LOG(astring("skipping parent node for ") + category_name);
264#endif
265 return _mark_tree;
266 }
267
268 // ensure that the categories aren't competing with other names.
269 astring main_name, nickname;
270 break_name(category_name, main_name, nickname);
271 astring *found1 = _category_names->find(main_name, case_insense_compare);
272 astring *found2 = _category_names->find(nickname, case_insense_compare);
273 if (found1 || found2) {
274 astring catnames;
275 if (found1) catnames = *found1; // add the first match, if it exists.
276 if (found2) {
277 if (!!catnames) catnames += " and ";
278 catnames += *found2;
279 }
280 LOG(astring("category name \"") + category_name
281 + "\" in conflict with existing: " + catnames);
282 inner_mark_tree *fake_it = NULL_POINTER;
283
284//hmmm: neither of these are right; they need to use a comparator that
285// uses our magic comparison function.
286
287 if (found1) {
288#ifdef DEBUG_MARKS
289 LOG(astring("found existing category for main name: ") + main_name);
290#endif
291// fake_it = (inner_mark_tree *)_mark_tree->find(*found1,
292// symbol_tree::recurse_downward);
293 fake_it = dynamic_cast<inner_mark_tree *>(_mark_tree->find
296 }
297 if (fake_it) {
298#ifdef DEBUG_MARKS
299 LOG(astring("returning existing category for main name: ") + main_name
300 + " as: " + fake_it->name());
301#endif
302 return fake_it;
303 }
304 if (found2) {
305#ifdef DEBUG_MARKS
306 LOG(astring("found existing category for nickname: ") + nickname);
307#endif
310 fake_it = dynamic_cast<inner_mark_tree *>(_mark_tree->find
313 }
314 if (fake_it) {
315#ifdef DEBUG_MARKS
316 LOG(astring("returning existing category for nickname: ") + nickname
317 + " as: " + fake_it->name());
318#endif
319 return fake_it;
320 }
321 LOG("==> failure to find a match for either category!");
322 deadly_error(class_name(), func, "collision resolution code failed; "
323 "please fix category error");
324 return NULL_POINTER;
325 }
326 // now that we know these names are unique, we'll add them into the list
327 // so future categories can't reuse these.
328 _category_names->add(main_name, main_name);
329 if (!!nickname) _category_names->add(nickname, nickname);
330
331 inner_mark_tree *parent = find_parent(parent_name);
332 _last_parent = parent; // set the parent for the next time.
333
334 // see if the category is already present under the parent.
335 for (int i = 0; i < parent->branches(); i++) {
336 inner_mark_tree *curr = dynamic_cast<inner_mark_tree *>(parent->branch(i));
337 if (!curr)
338 non_continuable_error(class_name(), DEADLY_LINE, "missing branch in tree");
339#ifdef DEBUG_MARKS_TREE
340 LOG(astring("looking at branch ") + curr->name());
341#endif
342 if (magic_category_comparison(curr->name(), category_name)) {
343 // it already exists? argh.
344 LOG(astring("category ") + category_name + " already exists under "
345 + parent_name + ".");
346 _last_node = curr;
347 return curr;
348 }
349 }
350
351 inner_mark_tree *new_node = new inner_mark_tree(category_name,
353 parent->attach(new_node);
354 parent->sort();
355 _last_node = new_node;
356
357 _category_count++;
358
359#ifdef DEBUG_MARKS_TREE
360 LOG(astring("attaching node ") + category_name + " to parent "
361 + parent->name());
362#endif
363 return new_node;
364}
365
367{
368 FUNCDEF("process_link");
369 astring description = items[1];
370 astring parent_name = items[2];
371 astring url = "UNKNOWN";
372 if (items.length() >= 4) url = items[3];
373
374 // strip any directory slashes that are provided as a suffix. we don't need
375 // them and they tend to confuse the issue when we look for duplicates.
376 while (url[url.end()] == '/') {
377 url.zap(url.end(), url.end());
378 }
379
380 // make some noise if they seem to have a badly formed link.
381 if (items.length() < 4) {
382 LOG(astring("link ") + description + " under " + parent_name
383 + " has no URL!");
384 } else if (items.length() > 4) {
385 LOG(astring("link ") + description + " under " + parent_name
386 + " has extra fields!");
387 }
388
389 // find the parent for this link.
390 inner_mark_tree *parent = find_parent(parent_name);
391 _last_parent = parent; // set the parent for the next time.
392
393 // see if the link already exists.
394 int *found = _links_seen->find(url, excellent_link_comparator);
395 if (found) {
396 // this is not so great; a duplicate link has been found.
397 LOG(a_sprintf("Duplicate Link: line %d already has ", *found) + url);
398 return;
399 } else {
400 _links_seen->add(prune_link_down(url), _line_number);
401 }
402
403 // add the link to the parent.
404 link_record *new_rec = new link_record(description, url, _line_number);
405 parent->_links.add(new_rec);
406
407 _link_count++;
408}
409
410void bookmark_tree::process_comment(const astring &current_line_in)
411{
412 FUNCDEF("process_comment");
413 astring current_line = current_line_in;
414
415 // output the comment as simple text.
416//BASE_LOG("comment case");
417 if (current_line.contains("http:")) {
418 astring hold_curr = current_line;
419 int indy = current_line.find("http:");
420 hold_curr.zap(0, indy - 1);
421 current_line = astring("&nbsp; &nbsp; &nbsp; &nbsp; "
422 "<a href=\"") + hold_curr + "\">" + hold_curr + "</a>";
423 } else if (current_line.t()) {
424 // snap the comment character off of the front.
425 current_line.zap(0, 0);
426 }
427
428 link_record *new_rec = new link_record(current_line,
430 _last_node->_links.add(new_rec, false);
431}
432
433int bookmark_tree::read_csv_file(const astring &input_filename)
434{
435 FUNCDEF("read_csv_file");
436 byte_filer input_file(input_filename, "r");
437 if (!input_file.good())
439 "the input file could not be opened");
440
441 string_array items; // parsed in csv line.
442 astring current_line; // read from input file.
443
444 // read the lines in the file, one at a time.
445 while (input_file.getline(current_line, MAX_LINE_SIZE) > 0) {
446 _line_number++; // go to the next line.
447 // remove the carriage returns first.
448 while (parser_bits::is_eol(current_line[current_line.end()])) {
449 current_line.zap(current_line.end(), current_line.end());
450 }
451 current_line.strip_spaces();
452 if (!current_line.length()) {
453// // blank lines get treated as a special case. they are always added
454// // at the end of the list.
455// process_comment(current_line);
456 continue;
457 } else if (current_line[0] == '#') {
458 // handle a comment in the database.
459 process_comment(current_line);
460 } else {
461 // csv parse the line, since we don't support much else.
462 bool parsed = list_parsing::parse_csv_line(current_line, items);
463 if (!parsed)
465 astring("the line could not be parsed: ") + current_line);
466 if (!items.length()) {
467 LOG("bad formatting on this line.");
468 continue;
469 }
470 if (items[0].iequals("C")) {
472 if (!node) {
473 LOG(astring("failed to get a node for ") + items[1]);
474 }
475 } else if (items[0].iequals("L")) {
476 process_link(items);
477 } else {
479 astring("unknown format in line: ") + current_line);
480 }
481 }
482 }
483 return 0;
484}
485
const astring & WWW_SITE
#define DEADLY_LINE
#define LOG(s)
bool case_insense_compare(const astring &a, const astring &b)
const astring & HTTP_HEAD
const astring & FTP_HEAD
const astring & FTP_SITE
const int MAX_LINE_SIZE
const int ESTIMATED_ELEMENTS
a_sprintf is a specialization of astring that provides printf style support.
Definition astring.h:440
int length() const
Returns the current reported length of the allocated C array.
Definition array.h:115
Provides a dynamically resizable ASCII character string.
Definition astring.h:35
bool t() const
t() is a shortcut for the string being "true", as in non-empty.
Definition astring.h:97
virtual void zap(int start, int end)
Deletes the characters between "start" and "end" inclusively.
Definition astring.cpp:524
bool substring(astring &target, int start, int end) const
a version that stores the substring in an existing "target" string.
Definition astring.cpp:868
static const astring & empty_string()
useful wherever empty strings are needed, e.g., function defaults.
Definition astring.cpp:128
void strip_spaces(how_to_strip way=FROM_BOTH_SIDES)
removes excess space characters from string's beginning, end or both.
Definition astring.h:325
bool iequals(const astring &that) const
returns true if this is case-insensitively equal to "that".
Definition astring.cpp:568
int end() const
returns the index of the last (non-null) character in the string.
Definition astring.h:86
int length() const
Returns the current length of the string.
Definition astring.cpp:132
bool compare(const astring &to_compare, int start_first, int start_second, int count, bool case_sensitive) const
Compares "this" string with "to_compare".
Definition astring.cpp:813
int find(char to_find, int position=0, bool reverse=false) const
Locates "to_find" in "this".
Definition astring.cpp:577
bool contains(const astring &to_find) const
Returns true if "to_find" is contained in this string or false if not.
Definition astring.cpp:162
static int find_prune_point(const basis::astring &to_prune)
attempts to locate the real start of the root URL in "to_prune".
inner_mark_tree * process_category(const structures::string_array &items)
inner_mark_tree * find_parent(const basis::astring &parent_name)
void process_comment(const basis::astring &current_line_in)
int read_csv_file(const basis::astring &input_filename)
void process_link(const structures::string_array &items)
static bool advance(int &index, const basis::astring &check, const basis::astring &finding)
moves the "index" forward if the "finding" string is the head of "check".
virtual ~bookmark_tree()
static basis::astring prune_link_down(const basis::astring &to_prune)
static bool magic_category_comparison(const basis::astring &a, const basis::astring &b)
static bool excellent_link_comparator(const basis::astring &a, const basis::astring &b)
static void break_name(const basis::astring &to_break, basis::astring &name, basis::astring &nick)
inner_mark_tree & access_root()
Provides file managment services using the standard I/O support.
Definition byte_filer.h:32
int getline(basis::abyte *buffer, int desired_size)
reads a line of text (terminated by a return) into the "buffer".
bool good()
returns true if the file seems to be in the appropriate desired state.
listo_links _links
An object representing the interstitial cell in most linked data structures.
Definition node.h:41
void sort()
sorts the sub-nodes of this symbol_tree.
symbol_tree * find(const basis::astring &to_find, find_methods how, basis::string_comparator_function *comp=NULL_POINTER)
returns the node specified by "to_find" or NULL_POINTER.
const basis::astring & name() const
returns the name of this node.
symbol_tree * branch(int index) const
returns the "index"th branch.
virtual int branches() const
Returns the number of branches currently connected to this tree.
Definition tree.cpp:431
virtual void attach(tree *new_branch)
Attaches the specified branch to the current tree.
Definition tree.cpp:453
int elements() const
the maximum number of elements currently allowed in this amorph.
Definition amorph.h:66
basis::outcome put(int field, const link_record *data)
Enters an object into the field at index "field" in the amorph.
Definition amorph.h:370
basis::outcome insert(int position, int lines_to_add)
Adds "lines_to_add" indices to the amorph at the index "position".
Definition amorph.h:345
link_record * borrow(int field)
Returns a pointer to the information at the index "field".
Definition amorph.h:448
An array of strings with some additional helpful methods.
Provides a symbol_table that holds strings as the content.
Maintains a list of names, where each name has a type and some contents.
contents * find(const basis::astring &name) const
returns the contents held for "name" or NULL_POINTER if it wasn't found.
basis::outcome add(const basis::astring &name, const contents &storage)
Enters a symbol name into the table along with some contents.
static bool parse_csv_line(const basis::astring &to_parse, structures::string_array &fields)
examines the string "to_parse" which should be in csv format.
static bool is_eol(char to_check)
returns true if "to_check" is part of an end-of-line sequence.
#define non_continuable_error(c, f, i)
an extra piece of information used, if available, in bounds_halt below.
#define deadly_error(c, f, i)
#define NULL_POINTER
The value representing a pointer to nothing.
Definition definitions.h:32
#define FUNCDEF(func_in)
FUNCDEF sets the name of a function (and plugs it into the callstack).
Definition enhance_cpp.h:54
bool append
Definition makedep.cpp:110
The guards collection helps in testing preconditions and reporting errors.
Definition array.h:30
void WHACK(contents *&ptr)
deletion with clearing of the pointer.
Definition functions.h:121
type maximum(type a, type b)
minimum returns the lesser of two values.
Definition functions.h:26
bool negative(const type &a)
negative returns true if "a" is less than zero.
Definition functions.h:43
const int KILOBYTE
Number of bytes in a kilobyte.
A platform independent way to obtain the timestamp of a file.
A logger that sends to the console screen using the standard output device.
A dynamic container class that holds any kind of object via pointers.
Definition amorph.h:55