core/applications/bookmark_tools/link_parser.cpp

   1 /*****************************************************************************\
   2 *                                                                             *
   3 *  Name   : link_parser                                                       *
   4 *  Author : Chris Koeritz                                                     *
   5 *                                                                             *
   6 *  Purpose:                                                                   *
   7 *                                                                             *
   8 *    Processes html files and finds the links.  A database in the HOOPLE      *
   9 *  link format is created from the links found.                               *
  10 *                                                                             *
  11 *******************************************************************************
  12 * Copyright (c) 1991-$now By Author.  This program is free software; you can  *
  13 * redistribute it and/or modify it under the terms of the GNU General Public  *
  14 * License as published by the Free Software Foundation; either version 2 of   *
  15 * the License or (at your option) any later version.  This is online at:      *
  16 *     http://www.fsf.org/copyleft/gpl.html                                    *
  17 * Please send any updates to: fred@gruntose.com                               *
  18 \*****************************************************************************/
  19
  20 // Notes:
  21 //
  22 // the standard link structure in html is similar to this:
  23 //     <a href="blahblah">Link Name and Launching Point</a>
  24 //
  25 // the standard we adopt for section titles is that it must be a heading
  26 // marker.  that formatting looks like this, for example:
  27 //     <h3 assorted_stuff>The Section Title:</h3>
  28
  29 #include "bookmark_tree.h"
  30
  31 #include <application/hoople_main.h>
  32 #include <basis/astring.h>
  33 #include <basis/functions.h>
  34 #include <basis/guards.h>
  35 #include <filesystem/byte_filer.h>
  36 #include <filesystem/filename.h>
  37 #include <loggers/critical_events.h>
  38 #include <loggers/file_logger.h>
  39 #include <structures/stack.h>
  40 #include <structures/static_memory_gremlin.h>
  41 #include <textual/parser_bits.h>
  42
  43 using namespace application;
  44 using namespace basis;
  45 using namespace filesystem;
  46 using namespace loggers;
  47 using namespace structures;
  48 using namespace textual;
  49
  50 #undef BASE_LOG
  51 #define BASE_LOG(s) program_wide_logger::get().log(s, ALWAYS_PRINT)
  52 #undef LOG
  53 #define LOG(s) CLASS_EMERGENCY_LOG(program_wide_logger::get(), s)
  54
  55 //#define DEBUG_LINK_PARSER
  56   // uncomment for noisier run to seek problems.
  57
  58 ////////////////////////////////////////////////////////////////////////////
  59
  60 const int MAX_FILE_SIZE = 4 * MEGABYTE;
  61   // this is the largest html file size we will process.
  62
  63 ////////////////////////////////////////////////////////////////////////////
  64
  65 // a macro that increments the position in the string and restarts the loop.
  66 #define INCREM_N_GO { curr_index++; continue; }
  67
  68 // puts the current character on the intermediate string.
  69 #define ADD_INTERMEDIATE \
  70   intermediate_text += full_contents[curr_index]
  71
  72 // returns a character in lower-case, if 'a' is in upper case.
  73 char normalize_char(char a)
  74 {
  75   if ( (a >= 'A') && (a <= 'Z') ) return a + 'a' - 'A';
  76   return a;
  77 }
  78
  79 // returns true if the two characters are the same, ignoring upper/lower case.
  80 bool caseless_equals(char a, char b) { return normalize_char(a) == normalize_char(b); }
  81
  82 // a macro that skips all characters until the specified one is seen.
  83 #define JUMP_TO_CHAR(to_find, save_them) { \
  84   while ( (curr_index < full_contents.length()) \
  85       && !caseless_equals(to_find, full_contents[curr_index]) ) { \
  86     if (save_them) ADD_INTERMEDIATE; \
  87     curr_index++; \
  88   } \
  89 }
  90
  91 // increments the state, the current character and restarts the loop.
  92 #define NEXT_STATE_INCREM { \
  93   state = parsing_states(state+1);  /* move forward in states. */ \
  94   curr_index++; \
  95   continue; \
  96 }
  97
  98 // cleans out the disallowed characters in the string provided.
  99 #define CLEAN_UP_NAUGHTY(s) { \
 100   while (s.replace("\n", " ")) {} \
 101   while (s.replace("\r", "")) {} \
 102   s.strip_spaces(); \
 103 }
 104
 105 //was before the strip_spaces code above.
 106 /*
 107   int indy = s.find("--"); \
 108   while (non_negative(indy)) { \
 109     s[indy] = ' ';  / * replace the first dash with a space. * / \
 110     for (int i = indy + 1; i < s.length(); i++) { \
 111       if (s[i] != '-') break; \
 112       s.zap(i, i); \
 113       i--; \
 114     } \
 115     indy = s.find("--"); \
 116   } \
 117   while (s.replace("  ", " ")) {} \
 118 */
 119
 120 // cleans up underscores in areas that are supposed to be english.
 121 #define MAKE_MORE_ENGLISH(s) \
 122   s.replace_all('_', ' ')
 123
 124 void strain_out_html_codes(astring &to_edit)
 125 {
 126   for (int i = 0; i < to_edit.length(); i++) {
 127     if (to_edit[i] != '<') continue;
 128     // found a left bracket.
 129     int indy = to_edit.find('>', i);
 130     if (negative(indy)) return;  // bail out, unexpected unmatched bracket.
 131     to_edit.zap(i, indy);
 132     i--;  // skip back to reconsider current place.
 133   }
 134 }
 135
 136 // writes out the currently accumulated link info.
 137 #define WRITE_LINK { \
 138   /* clean naughty characters out of the names. */ \
 139   CLEAN_UP_NAUGHTY(url_string); \
 140   CLEAN_UP_NAUGHTY(name_string); \
 141   if (url_string.ends(name_string)) { \
 142     /* handle the name being boring. replace with the intermediate text. */ \
 143     MAKE_MORE_ENGLISH(intermediate_text); \
 144     strain_out_html_codes(intermediate_text); \
 145     CLEAN_UP_NAUGHTY(intermediate_text); \
 146     if (intermediate_text.length()) \
 147       name_string = intermediate_text; \
 148   } \
 149   /* output a link in the HOOPLE format. */ \
 150   astring to_write = "\"L\",\""; \
 151   to_write += translate_web_chars(name_string); \
 152   to_write += "\",\""; \
 153   to_write += abbreviate_category(last_heading); \
 154   to_write += "\",\""; \
 155   to_write += translate_web_chars(url_string); \
 156   to_write += "\"\n"; \
 157   output_file.write(to_write); \
 158   _link_count++; \
 159 }
 160
 161 // writes out the current section in the HOOPLE format.
 162 // currently the parent category is set to Root.
 163 #define WRITE_SECTION { \
 164   CLEAN_UP_NAUGHTY(last_heading);  /* clean the name. */ \
 165   /* output a category definition. */ \
 166   astring to_write = "\"C\",\""; \
 167   to_write += last_heading; \
 168   to_write += "\",\""; \
 169   to_write += abbreviate_category(last_parents.top()); \
 170   to_write += "\"\n"; \
 171   output_file.write(to_write); \
 172   _category_count++; \
 173 }
 174
 175 // clears our accumulator strings.
 176 #define RESET_STRINGS { \
 177   url_string = astring::empty_string(); \
 178   name_string = astring::empty_string(); \
 179   intermediate_text = astring::empty_string(); \
 180 }
 181
 182 ////////////////////////////////////////////////////////////////////////////
 183
 184 class link_parser : public application_shell
 185 {
 186 public:
 187   link_parser();
 188   DEFINE_CLASS_NAME("link_parser");
 189   virtual int execute();
 190   int print_instructions(const filename &program_name);
 191
 192 private:
 193   int _link_count;  // number of links.
 194   int _category_count;  // number of categories.
 195
 196   astring url_string;  // the URL we've parsed.
 197   astring name_string;  // the name that we've parsed for the URL.
 198   astring last_heading;  // the last name that was set for a section.
 199   stack<astring> last_parents;  // the history of the parent names.
 200   astring intermediate_text;  // strings we saw before a link.
 201
 202   astring heading_num;
 203     // this string form of a number tracks what kind of heading was started.
 204
 205   astring abbreviate_category(const astring &simplify);
 206     // returns the inner category nickname if the category has one.
 207
 208   astring translate_web_chars(const astring &vervoom);
 209     // translates a few web chars that are safe for csv back into their non-encoded form.
 210 };
 211
 212 ////////////////////////////////////////////////////////////////////////////
 213
 214 link_parser::link_parser()
 215 : application_shell(),
 216   _link_count(0),
 217   _category_count(0),
 218   last_heading("Root"),
 219   last_parents()
 220 {
 221   last_parents.push(last_heading);  // make sure we have at least one level.
 222 }
 223
 224 int link_parser::print_instructions(const filename &program_name)
 225 {
 226   a_sprintf to_show("%s:\n\
 227 This program needs two filenames as command line parameters.  The -i flag\n\
 228 is used to specify the input filename and the -o flag specifies the output\n\
 229 file to be created.  The input file is expected to be an html file\n\
 230 containing links to assorted web sites.  The links are gathered, along with\n\
 231 descriptive text that happens to be near them, to create a link database in\n\
 232 the HOOPLE link format and write it to the output file.  HOOPLE link format\n\
 233 is basically a CSV file that defines the columns 1-4 for describing either\n\
 234 link categories (which support hierarchies) or actual links (i.e., URLs of\n\
 235 interest).  The links are written to a CSV file in the standard HOOPLE link\n\
 236 The HOOPLE link format is documented here:\n\
 237     http://hoople.org/guides/link_database/format_manifesto.txt\n\
 238 ", program_name.basename().raw().s(), program_name.basename().raw().s());
 239   program_wide_logger::get().log(to_show, ALWAYS_PRINT);
 240   return 12;
 241 }
 242
 243 astring link_parser::abbreviate_category(const astring &simplify)
 244 {
 245   astring to_return;
 246   astring name_portion;
 247   bookmark_tree::break_name(simplify, name_portion, to_return);
 248   if (!to_return) return name_portion;
 249   return to_return;
 250 }
 251
 252 astring link_parser::translate_web_chars(const astring &vervoom)
 253 {
 254   astring to_return = vervoom;
 255   to_return.replace_all("&amp;", "&");
 256   to_return.replace_all("%7E", "~");
 257   return to_return;
 258 }
 259
 260 int link_parser::execute()
 261 {
 262   FUNCDEF("main");
 263   command_line cmds(_global_argc, _global_argv);  // process the command line parameters.
 264   astring input_filename;  // we'll store our bookmarks file's name here.
 265   astring output_filename;  // where the processed marks go.
 266   if (!cmds.get_value('i', input_filename, false))
 267     return print_instructions(cmds.program_name());
 268   if (!cmds.get_value('o', output_filename, false))
 269     return print_instructions(cmds.program_name());
 270
 271   BASE_LOG(astring("input file: ") + input_filename);
 272   BASE_LOG(astring("output file: ") + output_filename);
 273
 274   astring full_contents;
 275   byte_filer input_file(input_filename, "r");
 276   if (!input_file.good())
 277     non_continuable_error(class_name(), func, "the input file could not be opened");
 278   input_file.read(full_contents, MAX_FILE_SIZE);
 279   input_file.close();
 280
 281   filename outname(output_filename);
 282   if (outname.exists()) {
 283     non_continuable_error(class_name(), func, astring("the output file ")
 284         + output_filename + " already exists.  It would be over-written if "
 285         "we continued.");
 286   }
 287
 288   byte_filer output_file(output_filename, "w");
 289   if (!output_file.good())
 290     non_continuable_error(class_name(), func, "the output file could not be opened");
 291
 292   enum parsing_states {
 293     // the states below are order dependent; do not change the ordering!
 294     SEEKING_LINK_START,  // looking for the beginning of an html link.
 295     SEEKING_HREF,  // finding the href portion of the link.
 296     GETTING_URL,  // chowing on the URL portion of the link.
 297     SEEKING_NAME,  // finding the closing bracket of the <a ...>.
 298     GETTING_NAME,  // chowing down on characters in the link's name.
 299     SEEKING_CLOSURE,  // looking for the </a> to end the link.
 300     // there is a discontinuity after SEEKING_CLOSURE, but then the following
 301     // states are also order dependent.
 302     SAW_TITLE_START,  // the beginning of a section heading was seen.
 303     GETTING_TITLE,  // grabbing characters in the title.
 304     // new text processing states.
 305     SAW_NESTING_INCREASE,  // a new nesting level has been achieved.
 306     SAW_NESTING_DECREASE,  // we exited from a former nesting level.
 307   };
 308
 309   int curr_index = 0;
 310   parsing_states state = SEEKING_LINK_START;
 311   while (curr_index < full_contents.length()) {
 312     switch (state) {
 313       case SEEKING_LINK_START:
 314         // if we don't see a less-than, then it's not the start of html code,
 315         // so we'll ignore it for now.
 316         if (full_contents[curr_index] != '<') {
 317           ADD_INTERMEDIATE;
 318           INCREM_N_GO;
 319         }
 320         // found a left angle bracket, so now we need to decided where to go next for parsing
 321         // the html coming up.
 322         curr_index++;
 323         // see if this is a heading.  if so, we can snag the heading name.
 324         if (caseless_equals('h', full_contents[curr_index])) {
 325 #ifdef DEBUG_LINK_PARSER
 326           LOG("into the '<h' case");
 327 #endif
 328           // check that we're seeing a heading definition here.
 329           char next = full_contents[curr_index + 1];
 330           if ( (next >= '0') && (next <= '9') ) {
 331             // we found our proper character for starting a heading.  we need
 332             // to jump into that state now.  we'll leave the cursor at the
 333             // beginning of the number.
 334             state = SAW_TITLE_START;
 335             INCREM_N_GO;
 336           }
 337         }
 338         // check if they're telling us a new indentation level of the type we care about.
 339         if (caseless_equals('d', full_contents[curr_index])) {
 340 #ifdef DEBUG_LINK_PARSER
 341           LOG("into the '<d' case");
 342 #endif
 343           // see if they gave us a <dl> tag.
 344           char next = full_contents[curr_index + 1];
 345           if (caseless_equals(next, 'l')) {
 346 #ifdef DEBUG_LINK_PARSER
 347             LOG("into the '<dl' case");
 348 #endif
 349             state = SAW_NESTING_INCREASE;
 350             INCREM_N_GO;
 351           }
 352         }
 353         // see if we can find a close for a nesting level.
 354         if (caseless_equals('/', full_contents[curr_index])) {
 355 #ifdef DEBUG_LINK_PARSER
 356           LOG("into the '</' case");
 357 #endif
 358           // see if they gave us a <dl> tag.
 359           if ( caseless_equals(full_contents[curr_index + 1], 'd')
 360               && caseless_equals(full_contents[curr_index + 2], 'l') ) {
 361 #ifdef DEBUG_LINK_PARSER
 362               LOG("into the '</dl' case");
 363 #endif
 364             state = SAW_NESTING_DECREASE;
 365             INCREM_N_GO;
 366           }
 367         }
 368         // see if it's not a link, and abandon ship if it's not, since that's the last option
 369         // for html code that we parse.
 370         if (!caseless_equals('a', full_contents[curr_index])) {
 371 #ifdef DEBUG_LINK_PARSER
 372           LOG("into the not an '<a' case");
 373 #endif
 374           intermediate_text += '<';
 375           JUMP_TO_CHAR('>', true);
 376           continue;
 377         }
 378 #ifdef DEBUG_LINK_PARSER
 379         LOG("into the final case, the '<a' case");
 380 #endif
 381         // found an a, but make sure that's the only character in the word.
 382         curr_index++;
 383         if (!parser_bits::white_space(full_contents[curr_index])) {
 384           intermediate_text += "<a";
 385           JUMP_TO_CHAR('>', true);
 386           continue;
 387         }
 388         // this looks like an address so find the start of the href.
 389         NEXT_STATE_INCREM;
 390         break;
 391       case SAW_NESTING_INCREASE:
 392         last_parents.push(last_heading);
 393 #ifdef DEBUG_LINK_PARSER
 394         LOG(a_sprintf("nesting inwards, new depth %d", last_parents.size()));
 395 #endif
 396         JUMP_TO_CHAR('>', false);
 397         state = SEEKING_LINK_START;
 398         break;
 399       case SAW_NESTING_DECREASE:
 400         last_parents.pop();
 401 #ifdef DEBUG_LINK_PARSER
 402         LOG(a_sprintf("nesting outwards, new depth %d", last_parents.size()));
 403 #endif
 404         JUMP_TO_CHAR('>', false);
 405         state = SEEKING_LINK_START;
 406         break;
 407       case SEEKING_HREF:
 408         JUMP_TO_CHAR('h', false);  // find the next 'h' for "href".
 409         curr_index++;
 410         if (!caseless_equals('r', full_contents[curr_index])) continue;
 411         curr_index++;
 412         if (!caseless_equals('e', full_contents[curr_index])) continue;
 413         curr_index++;
 414         if (!caseless_equals('f', full_contents[curr_index])) continue;
 415         curr_index++;
 416         if (full_contents[curr_index] != '=') continue;
 417         curr_index++;
 418         if (full_contents[curr_index] != '"') continue;
 419         // whew, got through the word href and the assignment.  the rest
 420         // should all be part of the link.
 421         NEXT_STATE_INCREM;
 422         break;
 423       case GETTING_URL:
 424         // as long as we don't see the closure of the quoted string for the
 425         // href, then we can keep accumulating characters from it.
 426         if (full_contents[curr_index] == '"') NEXT_STATE_INCREM;
 427         url_string += full_contents[curr_index];
 428         INCREM_N_GO;  // keep chewing on it in this same state.
 429         break;
 430       case SEEKING_NAME:
 431         JUMP_TO_CHAR('>', false);  // find closing bracket.
 432         NEXT_STATE_INCREM;  // now start grabbing the name characters.
 433         break;
 434       case GETTING_NAME:
 435         // we have to stop grabbing name characters when we spy a new code
 436         // being started.
 437         if (full_contents[curr_index] == '<') {
 438           // if we see a closing command, then we assume it's the one we want.
 439           if (full_contents[curr_index + 1] == '/')
 440             NEXT_STATE_INCREM;
 441           // if we see html inside the name, we just throw it out.
 442           JUMP_TO_CHAR('>', false);
 443           curr_index++;
 444           continue;
 445         }
 446         name_string += full_contents[curr_index];
 447         INCREM_N_GO;  // keep chewing on it in this same state.
 448         break;
 449       case SEEKING_CLOSURE:
 450         JUMP_TO_CHAR('>', false);  // find the closure of the html code.
 451         // write the link out now.
 452         WRITE_LINK;
 453         // clean out our accumulated strings.
 454         RESET_STRINGS;
 455         state = SEEKING_LINK_START;
 456         INCREM_N_GO;
 457         break;
 458       case SAW_TITLE_START:
 459         heading_num = full_contents.substring(curr_index, curr_index);
 460         JUMP_TO_CHAR('>', false);
 461         NEXT_STATE_INCREM;  // start eating the name.
 462         break;
 463       case GETTING_TITLE: {
 464         int indy = full_contents.find('<', curr_index);
 465         if (negative(indy)) {
 466           state = SEEKING_LINK_START;  // too weird, go back to start.
 467           continue;
 468         }
 469         // push the last title if it differs from the top of the stack.
 470         last_heading = full_contents.substring(curr_index, indy - 1);
 471         WRITE_SECTION;
 472         JUMP_TO_CHAR('<', false);  // now find the start of the header closure.
 473         JUMP_TO_CHAR('>', false);  // now find the end of the header closure.
 474         RESET_STRINGS;
 475         state = SEEKING_LINK_START;  // successfully found section name.
 476         break;
 477       }
 478       default:
 479         non_continuable_error(class_name(), func, "entered erroneous state!");
 480     }
 481   }
 482
 483   if (url_string.t()) WRITE_LINK;
 484
 485   output_file.close();
 486
 487   BASE_LOG(a_sprintf("wrote %d links in %d categories.", _link_count,
 488       _category_count));
 489
 490   return 0;
 491 }
 492
 493 ////////////////////////////////////////////////////////////////////////////
 494
 495 HOOPLE_MAIN(link_parser, )
 496