core/applications/bookmark_tools/link_parser.cpp

   1 /*****************************************************************************\
   2 *                                                                             *
   3 *  Name   : link_parser                                                       *
   4 *  Author : Chris Koeritz                                                     *
   5 *                                                                             *
   6 *  Purpose:                                                                   *
   7 *                                                                             *
   8 *    Processes html files and finds the links.  A database in the HOOPLE      *
   9 *  link format is created from the links found.                               *
  10 *                                                                             *
  11 *******************************************************************************
  12 * Copyright (c) 1991-$now By Author.  This program is free software; you can  *
  13 * redistribute it and/or modify it under the terms of the GNU General Public  *
  14 * License as published by the Free Software Foundation; either version 2 of   *
  15 * the License or (at your option) any later version.  This is online at:      *
  16 *     http://www.fsf.org/copyleft/gpl.html                                    *
  17 * Please send any updates to: fred@gruntose.com                               *
  18 \*****************************************************************************/
  19
  20 // Notes:
  21 //
  22 // the standard link structure in html is similar to this:
  23 //     <a href="blahblah">Link Name and Launching Point</a>
  24 //
  25 // the standard we adopt for section titles is that it must be a heading
  26 // marker.  that formatting looks like this, for example:
  27 //     <h3 assorted_stuff>The Section Title:</h3>
  28
  29 #include "bookmark_tree.h"
  30
  31 #include <application/hoople_main.h>
  32 #include <basis/astring.h>
  33 #include <basis/functions.h>
  34 #include <basis/guards.h>
  35 #include <filesystem/byte_filer.h>
  36 #include <filesystem/filename.h>
  37 #include <loggers/critical_events.h>
  38 #include <loggers/file_logger.h>
  39 #include <structures/stack.h>
  40 #include <structures/static_memory_gremlin.h>
  41 #include <textual/parser_bits.h>
  42
  43 using namespace application;
  44 using namespace basis;
  45 using namespace filesystem;
  46 using namespace loggers;
  47 using namespace structures;
  48 using namespace textual;
  49
// BASE_LOG: sends 's' to the program-wide logger with the ALWAYS_PRINT filter.
// any earlier definition is discarded first.
#undef BASE_LOG
#define BASE_LOG(s) program_wide_logger::get().log(s, ALWAYS_PRINT)
// LOG: hands 's' to CLASS_EMERGENCY_LOG along with the program-wide logger.
// any earlier definition is discarded first.
#undef LOG
#define LOG(s) CLASS_EMERGENCY_LOG(program_wide_logger::get(), s)
  54
  55 //#define DEBUG_LINK_PARSER
  56   // uncomment for noisier run to seek problems.
  57
  58 ////////////////////////////////////////////////////////////////////////////
  59
// size limit handed to the input file's read() call in execute().
const int MAX_FILE_SIZE = 4 * MEGABYTE;
  // this is the largest html file size we will process.
  62
  63 ////////////////////////////////////////////////////////////////////////////
  64
  65 // a macro that increments the position in the string and restarts the loop.
  66 #define INCREM_N_GO { curr_index++; continue; }
  67
  68 // puts the current character on the intermediate string.
  69 #define ADD_INTERMEDIATE { \
  70   char add_in = full_contents[curr_index]; \
  71   if ( (add_in == '<') || (add_in == '>') ) { \
  72     add_in = '-'; \
  73   } \
  74   intermediate_text += add_in; \
  75 }
  76
  77 // returns a character in lower-case, if 'a' is in upper case.
  78 char normalize_char(char a)
  79 {
  80   if ( (a >= 'A') && (a <= 'Z') ) return a + 'a' - 'A';
  81   return a;
  82 }
  83
  84 // returns true if the two characters are the same, ignoring upper/lower case.
  85 bool caseless_equals(char a, char b) { return normalize_char(a) == normalize_char(b); }
  86
  87 // a macro that skips all characters until the specified one is seen.
  88 #define JUMP_TO_CHAR(to_find, save_them) { \
  89   while ( (curr_index < full_contents.length()) \
  90       && !caseless_equals(to_find, full_contents[curr_index]) ) { \
  91     if (save_them) ADD_INTERMEDIATE; \
  92     curr_index++; \
  93   } \
  94 }
  95
  96 // increments the state, the current character and restarts the loop.
  97 #define NEXT_STATE_INCREM { \
  98   state = parsing_states(state+1);  /* move forward in states. */ \
  99   curr_index++; \
 100   continue; \
 101 }
 102
 103 // cleans out the disallowed characters in the string provided.
 104 #define CLEAN_UP_NAUGHTY(s) { \
 105   while (s.replace("\n", " ")) {} \
 106   while (s.replace("\r", "")) {} \
 107   s.strip_spaces(); \
 108 }
 109
 110 //was before the strip_spaces code above.
 111 /*
 112   int indy = s.find("--"); \
 113   while (non_negative(indy)) { \
 114     s[indy] = ' ';  / * replace the first dash with a space. * / \
 115     for (int i = indy + 1; i < s.length(); i++) { \
 116       if (s[i] != '-') break; \
 117       s.zap(i, i); \
 118       i--; \
 119     } \
 120     indy = s.find("--"); \
 121   } \
 122   while (s.replace("  ", " ")) {} \
 123 */
 124
 125 // cleans up underscores in areas that are supposed to be english.
 126 #define MAKE_MORE_ENGLISH(s) \
 127   s.replace_all('_', ' ')
 128
 129 void strain_out_html_codes(astring &to_edit)
 130 {
 131   for (int i = 0; i < to_edit.length(); i++) {
 132     if (to_edit[i] != '<') continue;
 133     // found a left bracket.
 134     int indy = to_edit.find('>', i);
 135     if (negative(indy)) return;  // bail out, unexpected unmatched bracket.
 136     to_edit.zap(i, indy);
 137     i--;  // skip back to reconsider current place.
 138   }
 139 }
 140
 141 // writes out the currently accumulated link info.
 142 #define WRITE_LINK { \
 143   /* clean naughty characters out of the names. */ \
 144   CLEAN_UP_NAUGHTY(url_string); \
 145   CLEAN_UP_NAUGHTY(name_string); \
 146   /* output a link in the HOOPLE format. */ \
 147   astring to_write = "\"L\",\""; \
 148   to_write += translate_web_chars(name_string); \
 149   to_write += "\",\""; \
 150   to_write += abbreviate_category(last_heading); \
 151   to_write += "\",\""; \
 152   to_write += translate_web_chars(url_string); \
 153   to_write += "\"\n"; \
 154   output_file.write(to_write); \
 155   _link_count++; \
 156 }
 157 //was after second clean up naughty
 158 /*argh yuck...  if (url_string.ends(name_string)) { \
 159     / * handle the name being boring. replace with the intermediate text. * / \
 160     MAKE_MORE_ENGLISH(intermediate_text); \
 161     strain_out_html_codes(intermediate_text); \
 162     CLEAN_UP_NAUGHTY(intermediate_text); \
 163     if (intermediate_text.length()) \
 164       name_string = intermediate_text; \
 165   } \
 166 */
 167
 168 // writes out the current section in the HOOPLE format.
 169 // currently the parent category is set to Root.
 170 #define WRITE_SECTION { \
 171   CLEAN_UP_NAUGHTY(last_heading);  /* clean the name. */ \
 172   /* output a category definition. */ \
 173   astring to_write = "\"C\",\""; \
 174   to_write += translate_web_chars(last_heading); \
 175   to_write += "\",\""; \
 176   to_write += abbreviate_category(last_parents.top()); \
 177   to_write += "\"\n"; \
 178   output_file.write(to_write); \
 179   _category_count++; \
 180 }
 181
 182 // clears our accumulator strings.
 183 #define RESET_STRINGS { \
 184   url_string = astring::empty_string(); \
 185   name_string = astring::empty_string(); \
 186   intermediate_text = astring::empty_string(); \
 187 }
 188
 189 ////////////////////////////////////////////////////////////////////////////
 190
 191 class link_parser : public application_shell
 192 {
 193 public:
 194   link_parser();
 195   DEFINE_CLASS_NAME("link_parser");
 196   virtual int execute();
 197   int print_instructions(const filename &program_name);
 198
 199 private:
 200   int _link_count;  // number of links.
 201   int _category_count;  // number of categories.
 202
 203   astring url_string;  // the URL we've parsed.
 204   astring name_string;  // the name that we've parsed for the URL.
 205   astring last_heading;  // the last name that was set for a section.
 206   stack<astring> last_parents;  // the history of the parent names.
 207   astring intermediate_text;  // strings we saw before a link.
 208
 209   astring heading_num;
 210     // this string form of a number tracks what kind of heading was started.
 211
 212   astring abbreviate_category(const astring &simplify);
 213     // returns the inner category nickname if the category has one.
 214
 215   astring translate_web_chars(const astring &vervoom);
 216     // translates a few web chars that are safe for csv back into their non-encoded form.
 217 };
 218
 219 ////////////////////////////////////////////////////////////////////////////
 220
 221 link_parser::link_parser()
 222 : application_shell(),
 223   _link_count(0),
 224   _category_count(0),
 225   last_heading("Root"),
 226   last_parents()
 227 {
 228   last_parents.push(last_heading);  // make sure we have at least one level.
 229 }
 230
 231 int link_parser::print_instructions(const filename &program_name)
 232 {
 233   a_sprintf to_show("%s:\n\
 234 This program needs two filenames as command line parameters.  The -i flag\n\
 235 is used to specify the input filename and the -o flag specifies the output\n\
 236 file to be created.  The input file is expected to be an html file\n\
 237 containing links to assorted web sites.  The links are gathered, along with\n\
 238 descriptive text that happens to be near them, to create a link database in\n\
 239 the HOOPLE link format and write it to the output file.  HOOPLE link format\n\
 240 is basically a CSV file that defines the columns 1-4 for describing either\n\
 241 link categories (which support hierarchies) or actual links (i.e., URLs of\n\
 242 interest).  The links are written to a CSV file in the standard HOOPLE link\n\
 243 The HOOPLE link format is documented here:\n\
 244     http://hoople.org/guides/link_database/format_manifesto.txt\n\
 245 ", program_name.basename().raw().s(), program_name.basename().raw().s());
 246   program_wide_logger::get().log(to_show, ALWAYS_PRINT);
 247   return 12;
 248 }
 249
 250 astring link_parser::abbreviate_category(const astring &simplify)
 251 {
 252   astring to_return;
 253   astring name_portion;
 254   bookmark_tree::break_name(simplify, name_portion, to_return);
 255   if (!to_return) return name_portion;
 256   return to_return;
 257 }
 258
 259 astring link_parser::translate_web_chars(const astring &vervoom)
 260 {
 261   astring to_return = vervoom;
 262   to_return.replace_all("&amp;", "&");
 263   to_return.replace_all("&auml;", "ä");
 264   to_return.replace_all("&copy;", "(c)");
 265   to_return.replace_all("&eacute;", "é");
 266   to_return.replace_all("&laquo;", "--");
 267   to_return.replace_all("&lsquo;", "'");
 268   to_return.replace_all("&ldquo;", "'");
 269   to_return.replace_all("&mdash;", "--");
 270   to_return.replace_all("&ndash;", "--");
 271   to_return.replace_all("&nbsp;", " ");
 272   to_return.replace_all("&raquo;", "--");
 273   to_return.replace_all("&rdquo;", "'");
 274   to_return.replace_all("&rsquo;", "'");
 275
 276   to_return.replace_all("%7E", "~");
 277   to_return.replace_all("%28", "(");
 278   to_return.replace_all("%29", ")");
 279   return to_return;
 280 }
 281
 282 int link_parser::execute()
 283 {
 284   FUNCDEF("main");
 285   command_line cmds(_global_argc, _global_argv);  // process the command line parameters.
 286   astring input_filename;  // we'll store our bookmarks file's name here.
 287   astring output_filename;  // where the processed marks go.
 288   if (!cmds.get_value('i', input_filename, false))
 289     return print_instructions(cmds.program_name());
 290   if (!cmds.get_value('o', output_filename, false))
 291     return print_instructions(cmds.program_name());
 292
 293   BASE_LOG(astring("input file: ") + input_filename);
 294   BASE_LOG(astring("output file: ") + output_filename);
 295
 296   astring full_contents;
 297   byte_filer input_file(input_filename, "r");
 298   if (!input_file.good())
 299     non_continuable_error(class_name(), func, "the input file could not be opened");
 300   input_file.read(full_contents, MAX_FILE_SIZE);
 301   input_file.close();
 302
 303   filename outname(output_filename);
 304   if (outname.exists()) {
 305     non_continuable_error(class_name(), func, astring("the output file ")
 306         + output_filename + " already exists.  It would be over-written if "
 307         "we continued.");
 308   }
 309
 310   byte_filer output_file(output_filename, "w");
 311   if (!output_file.good())
 312     non_continuable_error(class_name(), func, "the output file could not be opened");
 313
 314   enum parsing_states {
 315     // the states below are order dependent; do not change the ordering!
 316     SEEKING_LINK_START,  // looking for the beginning of an html link.
 317     SEEKING_HREF,  // finding the href portion of the link.
 318     GETTING_URL,  // chowing on the URL portion of the link.
 319     SEEKING_NAME,  // finding the closing bracket of the <a ...>.
 320     GETTING_NAME,  // chowing down on characters in the link's name.
 321     SEEKING_CLOSURE,  // looking for the </a> to end the link.
 322     // there is a discontinuity after SEEKING_CLOSURE, but then the following
 323     // states are also order dependent.
 324     SAW_TITLE_START,  // the beginning of a section heading was seen.
 325     GETTING_TITLE,  // grabbing characters in the title.
 326     // new text processing states.
 327     SAW_NESTING_INCREASE,  // a new nesting level has been achieved.
 328     SAW_NESTING_DECREASE,  // we exited from a former nesting level.
 329   };
 330
 331   int curr_index = 0;
 332   parsing_states state = SEEKING_LINK_START;
 333   while (curr_index < full_contents.length()) {
 334     switch (state) {
 335       case SEEKING_LINK_START:
 336         // if we don't see a less-than, then it's not the start of html code,
 337         // so we'll ignore it for now.
 338         if (full_contents[curr_index] != '<') {
 339           ADD_INTERMEDIATE;
 340           INCREM_N_GO;
 341         }
 342         // found a left angle bracket, so now we need to decided where to go next for parsing
 343         // the html coming up.
 344         curr_index++;
 345         // see if this is a heading.  if so, we can snag the heading name.
 346         if (caseless_equals('h', full_contents[curr_index])) {
 347 #ifdef DEBUG_LINK_PARSER
 348           LOG("into the '<h' case");
 349 #endif
 350           // check that we're seeing a heading definition here.
 351           char next = full_contents[curr_index + 1];
 352           if ( (next >= '0') && (next <= '9') ) {
 353             // we found our proper character for starting a heading.  we need
 354             // to jump into that state now.  we'll leave the cursor at the
 355             // beginning of the number.
 356             state = SAW_TITLE_START;
 357             INCREM_N_GO;
 358           }
 359         }
 360         // check if they're telling us a new indentation level of the type we care about.
 361         if (caseless_equals('d', full_contents[curr_index])) {
 362 #ifdef DEBUG_LINK_PARSER
 363           LOG("into the '<d' case");
 364 #endif
 365           // see if they gave us a <dl> tag.
 366           char next = full_contents[curr_index + 1];
 367           if (caseless_equals(next, 'l')) {
 368 #ifdef DEBUG_LINK_PARSER
 369             LOG("into the '<dl' case");
 370 #endif
 371             state = SAW_NESTING_INCREASE;
 372             INCREM_N_GO;
 373           }
 374         }
 375         // see if we can find a close for a nesting level.
 376         if (caseless_equals('/', full_contents[curr_index])) {
 377 #ifdef DEBUG_LINK_PARSER
 378           LOG("into the '</' case");
 379 #endif
 380           // see if they gave us a <dl> tag.
 381           if ( caseless_equals(full_contents[curr_index + 1], 'd')
 382               && caseless_equals(full_contents[curr_index + 2], 'l') ) {
 383 #ifdef DEBUG_LINK_PARSER
 384               LOG("into the '</dl' case");
 385 #endif
 386             state = SAW_NESTING_DECREASE;
 387             INCREM_N_GO;
 388           }
 389         }
 390         // see if it's not a link, and abandon ship if it's not, since that's the last option
 391         // for html code that we parse.
 392         if (!caseless_equals('a', full_contents[curr_index])) {
 393 #ifdef DEBUG_LINK_PARSER
 394           LOG("into the not an '<a' case");
 395 #endif
 396 //          intermediate_text += '<';
 397           JUMP_TO_CHAR('>', false);
 398           continue;
 399         }
 400 #ifdef DEBUG_LINK_PARSER
 401         LOG("into the final case, the '<a' case");
 402 #endif
 403         // found an a, but make sure that's the only character in the word.
 404         curr_index++;
 405         if (!parser_bits::white_space(full_contents[curr_index])) {
 406 //          intermediate_text += "<a";
 407           JUMP_TO_CHAR('>', false);
 408           continue;
 409         }
 410         // this looks like an address so find the start of the href.
 411         NEXT_STATE_INCREM;
 412         break;
 413       case SAW_NESTING_INCREASE:
 414         last_parents.push(last_heading);
 415 #ifdef DEBUG_LINK_PARSER
 416         LOG(a_sprintf("nesting inwards, new depth %d", last_parents.size()));
 417 #endif
 418         JUMP_TO_CHAR('>', false);
 419         state = SEEKING_LINK_START;
 420         break;
 421       case SAW_NESTING_DECREASE:
 422         last_parents.pop();
 423 #ifdef DEBUG_LINK_PARSER
 424         LOG(a_sprintf("nesting outwards, new depth %d", last_parents.size()));
 425 #endif
 426         JUMP_TO_CHAR('>', false);
 427         state = SEEKING_LINK_START;
 428         break;
 429       case SEEKING_HREF:
 430         JUMP_TO_CHAR('h', false);  // find the next 'h' for "href".
 431         curr_index++;
 432         if (!caseless_equals('r', full_contents[curr_index])) continue;
 433         curr_index++;
 434         if (!caseless_equals('e', full_contents[curr_index])) continue;
 435         curr_index++;
 436         if (!caseless_equals('f', full_contents[curr_index])) continue;
 437         curr_index++;
 438         if (full_contents[curr_index] != '=') continue;
 439         curr_index++;
 440         if (full_contents[curr_index] != '"') continue;
 441         // whew, got through the word href and the assignment.  the rest
 442         // should all be part of the link.
 443         NEXT_STATE_INCREM;
 444         break;
 445       case GETTING_URL:
 446         // as long as we don't see the closure of the quoted string for the
 447         // href, then we can keep accumulating characters from it.
 448         if (full_contents[curr_index] == '"') NEXT_STATE_INCREM;
 449         url_string += full_contents[curr_index];
 450         INCREM_N_GO;  // keep chewing on it in this same state.
 451         break;
 452       case SEEKING_NAME:
 453         JUMP_TO_CHAR('>', false);  // find closing bracket.
 454         NEXT_STATE_INCREM;  // now start grabbing the name characters.
 455         break;
 456       case GETTING_NAME:
 457         // we have to stop grabbing name characters when we spy a new code
 458         // being started.
 459         if (full_contents[curr_index] == '<') {
 460           // if we see a closing command, then we assume it's the one we want.
 461           if (full_contents[curr_index + 1] == '/')
 462             NEXT_STATE_INCREM;
 463           // if we see html inside the name, we just throw it out.
 464           JUMP_TO_CHAR('>', false);
 465           curr_index++;
 466           continue;
 467         }
 468         name_string += full_contents[curr_index];
 469         INCREM_N_GO;  // keep chewing on it in this same state.
 470         break;
 471       case SEEKING_CLOSURE:
 472         JUMP_TO_CHAR('>', false);  // find the closure of the html code.
 473         // write the link out now.
 474         WRITE_LINK;
 475         // clean out our accumulated strings.
 476         RESET_STRINGS;
 477         state = SEEKING_LINK_START;
 478         INCREM_N_GO;
 479         break;
 480       case SAW_TITLE_START:
 481         heading_num = full_contents.substring(curr_index, curr_index);
 482         JUMP_TO_CHAR('>', false);
 483         NEXT_STATE_INCREM;  // start eating the name.
 484         break;
 485       case GETTING_TITLE: {
 486         int indy = full_contents.find('<', curr_index);
 487         if (negative(indy)) {
 488           state = SEEKING_LINK_START;  // too weird, go back to start.
 489           continue;
 490         }
 491         // push the last title if it differs from the top of the stack.
 492         last_heading = full_contents.substring(curr_index, indy - 1);
 493         WRITE_SECTION;
 494         JUMP_TO_CHAR('<', false);  // now find the start of the header closure.
 495         JUMP_TO_CHAR('>', false);  // now find the end of the header closure.
 496         RESET_STRINGS;
 497         state = SEEKING_LINK_START;  // successfully found section name.
 498         break;
 499       }
 500       default:
 501         non_continuable_error(class_name(), func, "entered erroneous state!");
 502     }
 503   }
 504
 505   if (url_string.t()) WRITE_LINK;
 506
 507   output_file.close();
 508
 509   BASE_LOG(a_sprintf("wrote %d links in %d categories.", _link_count,
 510       _category_count));
 511
 512   return 0;
 513 }
 514
 515 ////////////////////////////////////////////////////////////////////////////
 516
// presumably expands to the program's entry point, which creates a
// link_parser and runs it.  NOTE(review): confirm against the macro's
// definition in application/hoople_main.h; the second argument is left empty.
HOOPLE_MAIN(link_parser, )
 518