feisty meow concerns codebase  2.140
link_parser.cpp
Go to the documentation of this file.
1 /*****************************************************************************\
2 * *
3 * Name : link_parser *
4 * Author : Chris Koeritz *
5 * *
6 * Purpose: *
7 * *
8 * Processes html files and finds the links. A database in the HOOPLE *
9 * link format is created from the links found. *
10 * *
11 *******************************************************************************
12 * Copyright (c) 1991-$now By Author. This program is free software; you can *
13 * redistribute it and/or modify it under the terms of the GNU General Public *
14 * License as published by the Free Software Foundation; either version 2 of *
15 * the License or (at your option) any later version. This is online at: *
16 * http://www.fsf.org/copyleft/gpl.html *
17 * Please send any updates to: fred@gruntose.com *
18 \*****************************************************************************/
19 
20 // Notes:
21 //
22 // the standard link structure in html is similar to this:
23 // <a href="blahblah">Link Name and Launching Point</a>
24 //
25 // the standard we adopt for section titles is that it must be a heading
26 // marker. that formatting looks like this, for example:
27 // <h3 assorted_stuff>The Section Title:</h3>
28 
29 #include "bookmark_tree.h"
30 
32 #include <basis/astring.h>
33 #include <basis/functions.h>
34 #include <basis/guards.h>
35 #include <filesystem/byte_filer.h>
36 #include <filesystem/filename.h>
38 #include <loggers/file_logger.h>
39 #include <structures/stack.h>
41 #include <textual/parser_bits.h>
42 
43 using namespace application;
44 using namespace basis;
45 using namespace filesystem;
46 using namespace loggers;
47 using namespace structures;
48 using namespace textual;
49 
// logging shortcuts local to this file. any earlier definitions of these macro
// names are discarded first so the versions below are the ones in effect.
//
// BASE_LOG(s): sends "s" to the program-wide logger with the ALWAYS_PRINT
// filter value.
50 #undef BASE_LOG
51 #define BASE_LOG(s) program_wide_logger::get().log(s, ALWAYS_PRINT)
// LOG(s): hands "s" and the program-wide logger to CLASS_EMERGENCY_LOG.
// NOTE(review): CLASS_EMERGENCY_LOG is defined outside this file; presumably it
// relies on the class_name()/func names set up by DEFINE_CLASS_NAME and
// FUNCDEF -- confirm.
52 #undef LOG
53 #define LOG(s) CLASS_EMERGENCY_LOG(program_wide_logger::get(), s)
54 
55 //#define DEBUG_LINK_PARSER
56  // uncomment for noisier run to seek problems.
57 
59 
60 const int MAX_FILE_SIZE = 4 * MEGABYTE;
61  // this is the largest html file size we will process. the value is passed
 // as the size argument when execute() reads the input file.
 // NOTE(review): presumably input beyond this size is simply not read --
 // confirm against byte_filer::read.
62 
64 
65 // a macro that increments the position in the string and restarts the loop.
// it expands to a block that bumps "curr_index" and then executes "continue",
// so it may only be used inside a loop where "curr_index" is in scope.
66 #define INCREM_N_GO { curr_index++; continue; }
67 
68 // puts the current character on the intermediate string.
// the character at full_contents[curr_index] is appended to
// "intermediate_text"; an angle bracket ('<' or '>') is stored as '-' instead.
// requires "full_contents", "curr_index" and "intermediate_text" to be in
// scope where the macro is expanded.
69 #define ADD_INTERMEDIATE { \
70  char add_in = full_contents[curr_index]; \
71  if ( (add_in == '<') || (add_in == '>') ) { \
72  add_in = '-'; \
73  } \
74  intermediate_text += add_in; \
75 }
76 
// folds an upper-case ASCII letter ('A' through 'Z') into its lower-case
// counterpart. any other character is handed back unchanged.
char normalize_char(char a)
{
  const bool is_upper = (a >= 'A') && (a <= 'Z');
  if (!is_upper) return a;
  // the lower-case letters sit a fixed distance above the upper-case ones.
  return a + ('a' - 'A');
}
83 
84 // returns true if the two characters are the same, ignoring upper/lower case.
85 bool caseless_equals(char a, char b) { return normalize_char(a) == normalize_char(b); }
86 
87 // a macro that skips all characters until the specified one is seen.
// "curr_index" is advanced until full_contents[curr_index] matches "to_find"
// (compared without regard to case) or the end of "full_contents" is reached.
// the cursor is left ON the matching character, not after it. when
// "save_them" is true, each skipped character is also recorded through
// ADD_INTERMEDIATE.
88 #define JUMP_TO_CHAR(to_find, save_them) { \
89  while ( (curr_index < full_contents.length()) \
90  && !caseless_equals(to_find, full_contents[curr_index]) ) { \
91  if (save_them) ADD_INTERMEDIATE; \
92  curr_index++; \
93  } \
94 }
95 
96 // increments the state, the current character and restarts the loop.
// the new state is computed as "state + 1" converted back to parsing_states,
// which is why the enumerators of parsing_states must keep their order.
// like INCREM_N_GO this ends in "continue", so it is only usable inside a loop.
97 #define NEXT_STATE_INCREM { \
98  state = parsing_states(state+1); /* move forward in states. */ \
99  curr_index++; \
100  continue; \
101 }
102 
103 // cleans out the disallowed characters in the string provided.
// each replace() call is repeated until it returns false, then strip_spaces()
// is invoked on the string. the intent is that newlines become spaces and
// carriage returns are removed.
// NOTE(review): presumably replace() changes one occurrence per call and
// strip_spaces() trims blanks at the ends -- confirm against astring.
104 #define CLEAN_UP_NAUGHTY(s) { \
105  while (s.replace("\n", " ")) {} \
106  while (s.replace("\r", "")) {} \
107  s.strip_spaces(); \
108 }
109 
110 //was before the strip_spaces code above.
111 /*
112  int indy = s.find("--"); \
113  while (non_negative(indy)) { \
114  s[indy] = ' '; / * replace the first dash with a space. * / \
115  for (int i = indy + 1; i < s.length(); i++) { \
116  if (s[i] != '-') break; \
117  s.zap(i, i); \
118  i--; \
119  } \
120  indy = s.find("--"); \
121  } \
122  while (s.replace(" ", " ")) {} \
123 */
124 
125 // cleans up underscores in areas that are supposed to be english.
// every '_' in "s" is changed into a space via replace_all. the expansion is a
// bare expression (no braces or trailing semicolon), so callers supply the ';'.
126 #define MAKE_MORE_ENGLISH(s) \
127  s.replace_all('_', ' ')
128 
// NOTE(review): the signature line of this function (listing line 129) is not
// visible here. it is presumably strain_out_html_codes, taking the string
// "to_edit" by reference -- confirm against the real source.
// the body removes every "<...>" span from "to_edit" in place.
130 {
131  for (int i = 0; i < to_edit.length(); i++) {
132  if (to_edit[i] != '<') continue;
133  // found a left bracket.
 // look for the matching right bracket at or after the left one.
134  int indy = to_edit.find('>', i);
135  if (negative(indy)) return; // bail out, unexpected unmatched bracket.
 // remove the whole bracketed span, both brackets included.
136  to_edit.zap(i, indy);
137  i--; // skip back to reconsider current place.
 // (the loop's i++ then lands on the character that slid into position i.)
138  }
139 }
140 
141 // writes out the currently accumulated link info.
// the record has the form
//   "L","<name>","<category>","<url>"
// followed by a newline, where <name> and <url> are "name_string" and
// "url_string" after CLEAN_UP_NAUGHTY and translate_web_chars, and <category>
// is abbreviate_category(last_heading). the record goes to "output_file" and
// "_link_count" is incremented. note that the cleanup modifies "url_string"
// and "name_string" themselves.
142 #define WRITE_LINK { \
143  /* clean naughty characters out of the names. */ \
144  CLEAN_UP_NAUGHTY(url_string); \
145  CLEAN_UP_NAUGHTY(name_string); \
146  /* output a link in the HOOPLE format. */ \
147  astring to_write = "\"L\",\""; \
148  to_write += translate_web_chars(name_string); \
149  to_write += "\",\""; \
150  to_write += abbreviate_category(last_heading); \
151  to_write += "\",\""; \
152  to_write += translate_web_chars(url_string); \
153  to_write += "\"\n"; \
154  output_file.write(to_write); \
155  _link_count++; \
156 }
157 //was after second clean up naughty
158 /*argh yuck... if (url_string.ends(name_string)) { \
159  / * handle the name being boring. replace with the intermediate text. * / \
160  MAKE_MORE_ENGLISH(intermediate_text); \
161  strain_out_html_codes(intermediate_text); \
162  CLEAN_UP_NAUGHTY(intermediate_text); \
163  if (intermediate_text.length()) \
164  name_string = intermediate_text; \
165  } \
166 */
167 
168 // writes out the current section in the HOOPLE format.
169 // the parent category is taken from the top of the "last_parents" stack.
// the record has the form
//   "C","<heading>","<parent>"
// followed by a newline, where <heading> is "last_heading" after
// CLEAN_UP_NAUGHTY and translate_web_chars, and <parent> is
// abbreviate_category(last_parents.top()). the record goes to "output_file"
// and "_category_count" is incremented.
// NOTE(review): top() is called without checking that the stack is non-empty.
170 #define WRITE_SECTION { \
171  CLEAN_UP_NAUGHTY(last_heading); /* clean the name. */ \
172  /* output a category definition. */ \
173  astring to_write = "\"C\",\""; \
174  to_write += translate_web_chars(last_heading); \
175  to_write += "\",\""; \
176  to_write += abbreviate_category(last_parents.top()); \
177  to_write += "\"\n"; \
178  output_file.write(to_write); \
179  _category_count++; \
180 }
181 
182 // clears our accumulator strings.
// "url_string", "name_string" and "intermediate_text" are each assigned
// astring::empty_string(). "last_heading" is deliberately not part of this.
183 #define RESET_STRINGS { \
184  url_string = astring::empty_string(); \
185  name_string = astring::empty_string(); \
186  intermediate_text = astring::empty_string(); \
187 }
188 
190 
// the application object for this program. execute() reads an html file,
// picks out the links in it and writes them to an output file as HOOPLE
// link records. the members below are the parser's working state; the
// parsing macros defined earlier in this file refer to them by name.
191 class link_parser : public application_shell
192 {
193 public:
194  link_parser();
195  DEFINE_CLASS_NAME("link_parser");
196  virtual int execute();
 // runs the whole conversion. returns zero after a completed run, or the
 // value from print_instructions() if a command line flag is missing.
197  int print_instructions(const filename &program_name);
 // logs the usage text, headed by the base name of "program_name", and
 // returns the value 12.
198 
199 private:
200  int _link_count; // number of links.
201  int _category_count; // number of categories.
202 
203  astring url_string; // the URL we've parsed.
204  astring name_string; // the name that we've parsed for the URL.
205  astring last_heading; // the last name that was set for a section.
206  stack<astring> last_parents; // the history of the parent names.
207  astring intermediate_text; // strings we saw before a link.
208 
209  astring heading_num;
210  // this string form of a number tracks what kind of heading was started.
 // it is filled from the single character under the cursor when a heading
 // tag is recognized.
211 
212  astring abbreviate_category(const astring &simplify);
213  // returns the inner category nickname if the category has one.
 // otherwise the name portion of "simplify" is returned.
214 
215  astring translate_web_chars(const astring &vervoom);
216  // translates a few web chars that are safe for csv back into their non-encoded form.
217 };
218 
220 
// sets both counters to zero and starts the heading at "Root", which is also
// pushed as the first entry of the parent stack.
// NOTE(review): listing line 222 is not visible here; it presumably opens the
// member initializer list (a ':' and possibly the base class initializer) --
// confirm against the real source.
221 link_parser::link_parser()
223  _link_count(0),
224  _category_count(0),
225  last_heading("Root"),
226  last_parents()
227 {
228  last_parents.push(last_heading); // make sure we have at least one level.
229 }
230 
231 int link_parser::print_instructions(const filename &program_name)
232 {
233  a_sprintf to_show("%s:\n\
234 This program needs two filenames as command line parameters. The -i flag\n\
235 is used to specify the input filename and the -o flag specifies the output\n\
236 file to be created. The input file is expected to be an html file\n\
237 containing links to assorted web sites. The links are gathered, along with\n\
238 descriptive text that happens to be near them, to create a link database in\n\
239 the HOOPLE link format and write it to the output file. HOOPLE link format\n\
240 is basically a CSV file that defines the columns 1-4 for describing either\n\
241 link categories (which support hierarchies) or actual links (i.e., URLs of\n\
242 interest). The links are written to a CSV file in the standard HOOPLE link\n\
243 The HOOPLE link format is documented here:\n\
244  http://feistymeow.org/guides/link_database/format_manifesto.txt\n\
245 ", program_name.basename().raw().s(), program_name.basename().raw().s());
246  program_wide_logger::get().log(to_show, ALWAYS_PRINT);
247  return 12;
248 }
249 
250 astring link_parser::abbreviate_category(const astring &simplify)
251 {
252  astring to_return;
253  astring name_portion;
254  bookmark_tree::break_name(simplify, name_portion, to_return);
255  if (!to_return) return name_portion;
256  return to_return;
257 }
258 
259 astring link_parser::translate_web_chars(const astring &vervoom)
260 {
261  astring to_return = vervoom;
262  to_return.replace_all("&amp;", "&");
263  to_return.replace_all("&auml;", "ä");
264  to_return.replace_all("&copy;", "(c)");
265  to_return.replace_all("&eacute;", "é");
266  to_return.replace_all("&laquo;", "--");
267  to_return.replace_all("&lsquo;", "'");
268  to_return.replace_all("&ldquo;", "'");
269  to_return.replace_all("&mdash;", "--");
270  to_return.replace_all("&ndash;", "--");
271  to_return.replace_all("&nbsp;", " ");
272  to_return.replace_all("&raquo;", "--");
273  to_return.replace_all("&rdquo;", "'");
274  to_return.replace_all("&rsquo;", "'");
275 
276  to_return.replace_all("%7E", "~");
277  to_return.replace_all("%28", "(");
278  to_return.replace_all("%29", ")");
279  return to_return;
280 }
281 
// runs the program: fetches the -i (input) and -o (output) file names from
// the command line, loads the input html into memory, and then walks it one
// character at a time with a small state machine. links that are recognized
// are written to the output file through WRITE_LINK. returns zero after a
// completed run; if either flag is missing the usage text is shown and the
// value from print_instructions() is returned instead.
//
// NOTE(review): the numbering embedded in this listing skips lines 339, 411,
// 443, 462, 476, 493 and 496 inside this function, so a few statements are
// not visible here (they look like state changes and macro invocations).
// each gap is marked below; confirm against the real source before relying
// on the control flow exactly as shown.
282 int link_parser::execute()
283 {
 // NOTE(review): the function name registered here is "main" rather than
 // "execute"; presumably it appears in error reports -- confirm if intended.
284  FUNCDEF("main");
285  command_line cmds(_global_argc, _global_argv); // process the command line parameters.
286  astring input_filename; // we'll store our bookmarks file's name here.
287  astring output_filename; // where the processed marks go.
288  if (!cmds.get_value('i', input_filename, false))
289  return print_instructions(cmds.program_name());
290  if (!cmds.get_value('o', output_filename, false))
291  return print_instructions(cmds.program_name());
292 
293  BASE_LOG(astring("input file: ") + input_filename);
294  BASE_LOG(astring("output file: ") + output_filename);
295 
 // pull the input into memory in one read, using MAX_FILE_SIZE as the size.
296  astring full_contents;
297  byte_filer input_file(input_filename, "r");
298  if (!input_file.good())
299  non_continuable_error(class_name(), func, "the input file could not be opened");
300  input_file.read(full_contents, MAX_FILE_SIZE);
301  input_file.close();
302 
 // an existing output file is reported as an error rather than replaced.
303  filename outname(output_filename);
304  if (outname.exists()) {
305  non_continuable_error(class_name(), func, astring("the output file ")
306  + output_filename + " already exists. It would be over-written if "
307  "we continued.");
308  }
309 
310  byte_filer output_file(output_filename, "w");
311  if (!output_file.good())
312  non_continuable_error(class_name(), func, "the output file could not be opened");
313 
 // NEXT_STATE_INCREM moves from one state to the next by adding one to the
 // current enumerator, which is the reason for the ordering rule below.
314  enum parsing_states {
315  // the states below are order dependent; do not change the ordering!
316  SEEKING_LINK_START, // looking for the beginning of an html link.
317  SEEKING_HREF, // finding the href portion of the link.
318  GETTING_URL, // chowing on the URL portion of the link.
319  SEEKING_NAME, // finding the closing bracket of the <a ...>.
320  GETTING_NAME, // chowing down on characters in the link's name.
321  SEEKING_CLOSURE, // looking for the </a> to end the link.
322  // there is a discontinuity after SEEKING_CLOSURE, but then the following
323  // states are also order dependent.
324  SAW_TITLE_START, // the beginning of a section heading was seen.
325  GETTING_TITLE, // grabbing characters in the title.
326  // new text processing states.
327  SAW_NESTING_INCREASE, // a new nesting level has been achieved.
328  SAW_NESTING_DECREASE, // we exited from a former nesting level.
329  };
330 
 // curr_index is the cursor into full_contents; every pass through the loop
 // handles the character under the cursor according to the current state.
331  int curr_index = 0;
332  parsing_states state = SEEKING_LINK_START;
333  while (curr_index < full_contents.length()) {
334  switch (state) {
335  case SEEKING_LINK_START:
336  // if we don't see a less-than, then it's not the start of html code,
337  // so we'll ignore it for now.
338  if (full_contents[curr_index] != '<') {
 // NOTE(review): listing line 339 is not visible here.
340  INCREM_N_GO;
341  }
342  // found a left angle bracket, so now we need to decide where to go next for parsing
343  // the html coming up.
344  curr_index++;
 // NOTE(review): from here on the cursor (and cursor + 1 or + 2 below) can
 // sit at or past the end of full_contents when the file ends inside a tag.
 // whether astring's operator [] tolerates that is not visible in this
 // file -- confirm.
345  // see if this is a heading. if so, we can snag the heading name.
346  if (caseless_equals('h', full_contents[curr_index])) {
347 #ifdef DEBUG_LINK_PARSER
348  LOG("into the '<h' case");
349 #endif
350  // check that we're seeing a heading definition here.
351  char next = full_contents[curr_index + 1];
352  if ( (next >= '0') && (next <= '9') ) {
353  // we found our proper character for starting a heading. we need
354  // to jump into that state now. we'll leave the cursor at the
355  // beginning of the number.
356  state = SAW_TITLE_START;
357  INCREM_N_GO;
358  }
359  }
360  // check if they're telling us a new indentation level of the type we care about.
361  if (caseless_equals('d', full_contents[curr_index])) {
362 #ifdef DEBUG_LINK_PARSER
363  LOG("into the '<d' case");
364 #endif
365  // see if they gave us a <dl> tag.
366  char next = full_contents[curr_index + 1];
367  if (caseless_equals(next, 'l')) {
368 #ifdef DEBUG_LINK_PARSER
369  LOG("into the '<dl' case");
370 #endif
371  state = SAW_NESTING_INCREASE;
372  INCREM_N_GO;
373  }
374  }
375  // see if we can find a close for a nesting level.
376  if (caseless_equals('/', full_contents[curr_index])) {
377 #ifdef DEBUG_LINK_PARSER
378  LOG("into the '</' case");
379 #endif
380  // see if they gave us a </dl> tag.
381  if ( caseless_equals(full_contents[curr_index + 1], 'd')
382  && caseless_equals(full_contents[curr_index + 2], 'l') ) {
383 #ifdef DEBUG_LINK_PARSER
384  LOG("into the '</dl' case");
385 #endif
386  state = SAW_NESTING_DECREASE;
387  INCREM_N_GO;
388  }
389  }
390  // see if it's not a link, and abandon ship if it's not, since that's the last option
391  // for html code that we parse.
392  if (!caseless_equals('a', full_contents[curr_index])) {
393 #ifdef DEBUG_LINK_PARSER
394  LOG("into the not an '<a' case");
395 #endif
396 // intermediate_text += '<';
 // skip the rest of this tag; the state stays at SEEKING_LINK_START.
397  JUMP_TO_CHAR('>', false);
398  continue;
399  }
400 #ifdef DEBUG_LINK_PARSER
401  LOG("into the final case, the '<a' case");
402 #endif
403  // found an a, but make sure that's the only character in the word.
404  curr_index++;
405  if (!parser_bits::white_space(full_contents[curr_index])) {
406 // intermediate_text += "<a";
407  JUMP_TO_CHAR('>', false);
408  continue;
409  }
410  // this looks like an address so find the start of the href.
 // NOTE(review): listing line 411 is not visible here; presumably it moves
 // the state on to SEEKING_HREF -- confirm.
412  break;
413  case SAW_NESTING_INCREASE:
 // the most recent heading becomes the parent for everything nested inside.
414  last_parents.push(last_heading);
415 #ifdef DEBUG_LINK_PARSER
416  LOG(a_sprintf("nesting inwards, new depth %d", last_parents.size()));
417 #endif
418  JUMP_TO_CHAR('>', false);
419  state = SEEKING_LINK_START;
420  break;
421  case SAW_NESTING_DECREASE:
 // NOTE(review): the pop is not guarded against more closing tags than
 // opening ones; what the stack does when over-popped is not visible in
 // this file -- confirm.
422  last_parents.pop();
423 #ifdef DEBUG_LINK_PARSER
424  LOG(a_sprintf("nesting outwards, new depth %d", last_parents.size()));
425 #endif
426  JUMP_TO_CHAR('>', false);
427  state = SEEKING_LINK_START;
428  break;
429  case SEEKING_HREF:
430  JUMP_TO_CHAR('h', false); // find the next 'h' for "href".
 // each check below that fails restarts the loop in this same state, so
 // the search simply resumes from wherever the cursor stopped. only the
 // exact form href=" (no spaces, double quote) is accepted.
431  curr_index++;
432  if (!caseless_equals('r', full_contents[curr_index])) continue;
433  curr_index++;
434  if (!caseless_equals('e', full_contents[curr_index])) continue;
435  curr_index++;
436  if (!caseless_equals('f', full_contents[curr_index])) continue;
437  curr_index++;
438  if (full_contents[curr_index] != '=') continue;
439  curr_index++;
440  if (full_contents[curr_index] != '"') continue;
441  // whew, got through the word href and the assignment. the rest
442  // should all be part of the link.
 // NOTE(review): listing line 443 is not visible here; presumably it
 // advances to GETTING_URL -- confirm.
444  break;
445  case GETTING_URL:
446  // as long as we don't see the closure of the quoted string for the
447  // href, then we can keep accumulating characters from it.
448  if (full_contents[curr_index] == '"') NEXT_STATE_INCREM;
449  url_string += full_contents[curr_index];
450  INCREM_N_GO; // keep chewing on it in this same state.
451  break;
452  case SEEKING_NAME:
453  JUMP_TO_CHAR('>', false); // find closing bracket.
454  NEXT_STATE_INCREM; // now start grabbing the name characters.
455  break;
456  case GETTING_NAME:
457  // we have to stop grabbing name characters when we spy a new code
458  // being started.
459  if (full_contents[curr_index] == '<') {
460  // if we see a closing command, then we assume it's the one we want.
461  if (full_contents[curr_index + 1] == '/')
 // NOTE(review): listing line 462 is not visible here; presumably it is
 // the statement controlled by the "if" above -- confirm. as shown in
 // this listing, the "if" would govern the JUMP_TO_CHAR below instead.
463  // if we see html inside the name, we just throw it out.
464  JUMP_TO_CHAR('>', false);
465  curr_index++;
466  continue;
467  }
468  name_string += full_contents[curr_index];
469  INCREM_N_GO; // keep chewing on it in this same state.
470  break;
471  case SEEKING_CLOSURE:
472  JUMP_TO_CHAR('>', false); // find the closure of the html code.
473  // write the link out now.
474  WRITE_LINK;
475  // clean out our accumulated strings.
 // NOTE(review): listing line 476 is not visible here; presumably it
 // invokes RESET_STRINGS -- confirm.
477  state = SEEKING_LINK_START;
478  INCREM_N_GO;
479  break;
480  case SAW_TITLE_START:
 // remember the single character under the cursor (left on the heading
 // number by the SEEKING_LINK_START case) as the heading level.
481  heading_num = full_contents.substring(curr_index, curr_index);
482  JUMP_TO_CHAR('>', false);
483  NEXT_STATE_INCREM; // start eating the name.
484  break;
485  case GETTING_TITLE: {
486  int indy = full_contents.find('<', curr_index);
487  if (negative(indy)) {
488  state = SEEKING_LINK_START; // too weird, go back to start.
489  continue;
490  }
491  // the text from the cursor up to (not including) the '<' is the new heading.
492  last_heading = full_contents.substring(curr_index, indy - 1);
 // NOTE(review): listing line 493 is not visible here.
494  JUMP_TO_CHAR('<', false); // now find the start of the header closure.
495  JUMP_TO_CHAR('>', false); // now find the end of the header closure.
 // NOTE(review): listing line 496 is not visible here; presumably it
 // invokes WRITE_SECTION -- confirm.
497  state = SEEKING_LINK_START; // successfully found section name.
498  break;
499  }
500  default:
501  non_continuable_error(class_name(), func, "entered erroneous state!");
502  }
503  }
504 
 // flush a link that was still being accumulated when the input ran out.
 // NOTE(review): presumably t() reports that the string is non-empty -- confirm.
505  if (url_string.t()) WRITE_LINK;
506 
507  output_file.close();
508 
509  BASE_LOG(a_sprintf("wrote %d links in %d categories.", _link_count,
510  _category_count));
511 
512  return 0;
513 }
514 
516 
517 HOOPLE_MAIN(link_parser, )
518 
int print_instructions(bool good, const astring &program_name)
Definition: checker.cpp:45
The application_shell is a base object for console programs.
a_sprintf is a specialization of astring that provides printf style support.
Definition: astring.h:440
Provides a dynamically resizable ASCII character string.
Definition: astring.h:35
const char * s() const
synonym for observe. the 's' stands for "string", if that helps.
Definition: astring.h:113
virtual void zap(int start, int end)
Deletes the characters between "start" and "end" inclusively.
Definition: astring.cpp:521
bool substring(astring &target, int start, int end) const
a version that stores the substring in an existing "target" string.
Definition: astring.cpp:865
bool replace_all(char to_replace, char new_char)
changes all occurrences of "to_replace" with "new_char".
Definition: astring.cpp:929
int length() const
Returns the current length of the string.
Definition: astring.cpp:132
int find(char to_find, int position=0, bool reverse=false) const
Locates "to_find" in "this".
Definition: astring.cpp:574
static void break_name(const basis::astring &to_break, basis::astring &name, basis::astring &nick)
Provides file managment services using the standard I/O support.
Definition: byte_filer.h:32
Provides operations commonly needed on file names.
Definition: filename.h:64
const basis::astring & raw() const
returns the astring that we're holding onto for the path.
Definition: filename.cpp:97
filename basename() const
returns the base of the filename; no directory.
Definition: filename.cpp:385
#define non_continuable_error(c, f, i)
an extra piece of information used, if available, in bounds_halt below.
#define DEFINE_CLASS_NAME(objname)
Defines the name of a class by providing a couple standard methods.
Definition: enhance_cpp.h:45
#define FUNCDEF(func_in)
FUNCDEF sets the name of a function (and plugs it into the callstack).
Definition: enhance_cpp.h:57
Provides macros that implement the 'main' program of an application.
#define HOOPLE_MAIN(obj_name, obj_args)
options that should work for most unix and linux apps.
Definition: hoople_main.h:61
Implements an application lock to ensure only one is running at once.
char ** _global_argv
The guards collection helps in testing preconditions and reporting errors.
Definition: array.h:30
const int MEGABYTE
Number of bytes in a megabyte.
Definition: definitions.h:135
bool negative(const type &a)
negative returns true if "a" is less than zero.
Definition: functions.h:43
A platform independent way to obtain the timestamp of a file.
Definition: byte_filer.cpp:37
A logger that sends to the console screen using the standard output device.
A dynamic container class that holds any kind of object via pointers.
Definition: amorph.h:55