1 /*****************************************************************************\
4 * Author : Chris Koeritz *
8 * Processes html files and finds the links. A database in the HOOPLE *
9 * link format is created from the links found. *
11 *******************************************************************************
12 * Copyright (c) 1991-$now By Author. This program is free software; you can *
13 * redistribute it and/or modify it under the terms of the GNU General Public *
14 * License as published by the Free Software Foundation; either version 2 of *
15 * the License or (at your option) any later version. This is online at: *
16 * http://www.fsf.org/copyleft/gpl.html *
17 * Please send any updates to: fred@gruntose.com *
18 \*****************************************************************************/
22 // the standard link structure in html is similar to this:
23 // <a href="blahblah">Link Name and Launching Point</a>
25 // the standard we adopt for section titles is that it must be a heading
26 // marker. that formatting looks like this, for example:
27 // <h3 assorted_stuff>The Section Title:</h3>
29 #include "bookmark_tree.h"
31 #include <application/hoople_main.h>
32 #include <basis/astring.h>
33 #include <basis/functions.h>
34 #include <basis/guards.h>
35 #include <filesystem/byte_filer.h>
36 #include <filesystem/filename.h>
37 #include <loggers/critical_events.h>
38 #include <loggers/file_logger.h>
39 #include <structures/stack.h>
40 #include <structures/static_memory_gremlin.h>
41 #include <textual/parser_bits.h>
43 using namespace application;
44 using namespace basis;
45 using namespace filesystem;
46 using namespace loggers;
47 using namespace structures;
48 using namespace textual;
51 #define BASE_LOG(s) program_wide_logger::get().log(s, ALWAYS_PRINT)
53 #define LOG(s) CLASS_EMERGENCY_LOG(program_wide_logger::get(), s)
55 //#define DEBUG_LINK_PARSER
56 // uncomment for noisier run to seek problems.
58 ////////////////////////////////////////////////////////////////////////////
60 const int MAX_FILE_SIZE = 4 * MEGABYTE;
61 // this is the largest html file size we will process.
63 ////////////////////////////////////////////////////////////////////////////
65 // a macro that increments the position in the string and restarts the loop.
66 #define INCREM_N_GO { curr_index++; continue; }
68 // puts the current character on the intermediate string.
69 #define ADD_INTERMEDIATE { \
70 char add_in = full_contents[curr_index]; \
71 if ( (add_in == '<') || (add_in == '>') ) { \
74 intermediate_text += add_in; \
77 // returns a character in lower-case, if 'a' is in upper case.
78 char normalize_char(char a)
80 if ( (a >= 'A') && (a <= 'Z') ) return a + 'a' - 'A';
84 // returns true if the two characters are the same, ignoring upper/lower case.
85 bool caseless_equals(char a, char b) { return normalize_char(a) == normalize_char(b); }
87 // a macro that skips all characters until the specified one is seen.
88 #define JUMP_TO_CHAR(to_find, save_them) { \
89 while ( (curr_index < full_contents.length()) \
90 && !caseless_equals(to_find, full_contents[curr_index]) ) { \
91 if (save_them) ADD_INTERMEDIATE; \
96 // increments the state, the current character and restarts the loop.
97 #define NEXT_STATE_INCREM { \
98 state = parsing_states(state+1); /* move forward in states. */ \
103 // cleans out the disallowed characters in the string provided.
104 #define CLEAN_UP_NAUGHTY(s) { \
105 while (s.replace("\n", " ")) {} \
106 while (s.replace("\r", "")) {} \
110 //was before the strip_spaces code above.
112 int indy = s.find("--"); \
113 while (non_negative(indy)) { \
114 s[indy] = ' '; / * replace the first dash with a space. * / \
115 for (int i = indy + 1; i < s.length(); i++) { \
116 if (s[i] != '-') break; \
120 indy = s.find("--"); \
122 while (s.replace(" ", " ")) {} \
// cleans up underscores in areas that are supposed to be english by turning
// each '_' in 's' into a space.  's' must provide replace_all(char, char).
// the argument is parenthesized so that expressions such as a dereferenced
// pointer expand correctly; no trailing semicolon is supplied, so the caller
// writes MAKE_MORE_ENGLISH(x); like a normal statement.
#define MAKE_MORE_ENGLISH(s) \
  (s).replace_all('_', ' ')
129 void strain_out_html_codes(astring &to_edit)
131 for (int i = 0; i < to_edit.length(); i++) {
132 if (to_edit[i] != '<') continue;
133 // found a left bracket.
134 int indy = to_edit.find('>', i);
135 if (negative(indy)) return; // bail out, unexpected unmatched bracket.
136 to_edit.zap(i, indy);
137 i--; // skip back to reconsider current place.
141 // writes out the currently accumulated link info.
142 #define WRITE_LINK { \
143 /* clean naughty characters out of the names. */ \
144 CLEAN_UP_NAUGHTY(url_string); \
145 CLEAN_UP_NAUGHTY(name_string); \
146 /* output a link in the HOOPLE format. */ \
147 astring to_write = "\"L\",\""; \
148 to_write += translate_web_chars(name_string); \
149 to_write += "\",\""; \
150 to_write += abbreviate_category(last_heading); \
151 to_write += "\",\""; \
152 to_write += translate_web_chars(url_string); \
153 to_write += "\"\n"; \
154 output_file.write(to_write); \
157 //was after second clean up naughty
158 /*argh yuck... if (url_string.ends(name_string)) { \
159 / * handle the name being boring. replace with the intermediate text. * / \
160 MAKE_MORE_ENGLISH(intermediate_text); \
161 strain_out_html_codes(intermediate_text); \
162 CLEAN_UP_NAUGHTY(intermediate_text); \
163 if (intermediate_text.length()) \
164 name_string = intermediate_text; \
168 // writes out the current section in the HOOPLE format.
169 // the parent category written is whatever is on top of the last_parents stack.
170 #define WRITE_SECTION { \
171 CLEAN_UP_NAUGHTY(last_heading); /* clean the name. */ \
172 /* output a category definition. */ \
173 astring to_write = "\"C\",\""; \
174 to_write += translate_web_chars(last_heading); \
175 to_write += "\",\""; \
176 to_write += abbreviate_category(last_parents.top()); \
177 to_write += "\"\n"; \
178 output_file.write(to_write); \
182 // clears our accumulator strings.
183 #define RESET_STRINGS { \
184 url_string = astring::empty_string(); \
185 name_string = astring::empty_string(); \
186 intermediate_text = astring::empty_string(); \
189 ////////////////////////////////////////////////////////////////////////////
191 class link_parser : public application_shell
195 DEFINE_CLASS_NAME("link_parser");
196 virtual int execute();
197 int print_instructions(const filename &program_name);
200 int _link_count; // number of links.
201 int _category_count; // number of categories.
203 astring url_string; // the URL we've parsed.
204 astring name_string; // the name that we've parsed for the URL.
205 astring last_heading; // the last name that was set for a section.
206 stack<astring> last_parents; // the history of the parent names.
207 astring intermediate_text; // strings we saw before a link.
210 // this string form of a number tracks what kind of heading was started.
212 astring abbreviate_category(const astring &simplify);
213 // returns the inner category nickname if the category has one.
215 astring translate_web_chars(const astring &vervoom);
216 // translates a few web chars that are safe for csv back into their non-encoded form.
219 ////////////////////////////////////////////////////////////////////////////
221 link_parser::link_parser()
222 : application_shell(),
225 last_heading("Root"),
228 last_parents.push(last_heading); // make sure we have at least one level.
231 int link_parser::print_instructions(const filename &program_name)
233 a_sprintf to_show("%s:\n\
234 This program needs two filenames as command line parameters. The -i flag\n\
235 is used to specify the input filename and the -o flag specifies the output\n\
236 file to be created. The input file is expected to be an html file\n\
237 containing links to assorted web sites. The links are gathered, along with\n\
238 descriptive text that happens to be near them, to create a link database in\n\
239 the HOOPLE link format and write it to the output file. HOOPLE link format\n\
240 is basically a CSV file that defines the columns 1-4 for describing either\n\
241 link categories (which support hierarchies) or actual links (i.e., URLs of\n\
242 interest). The links are written to a CSV file in the standard HOOPLE link\n\
243 The HOOPLE link format is documented here:\n\
244 http://feistymeow.org/guides/link_database/format_manifesto.txt\n\
245 ", program_name.basename().raw().s(), program_name.basename().raw().s());
246 program_wide_logger::get().log(to_show, ALWAYS_PRINT);
250 astring link_parser::abbreviate_category(const astring &simplify)
253 astring name_portion;
254 bookmark_tree::break_name(simplify, name_portion, to_return);
255 if (!to_return) return name_portion;
259 astring link_parser::translate_web_chars(const astring &vervoom)
261 astring to_return = vervoom;
262 to_return.replace_all("&", "&");
263 to_return.replace_all("ä", "ä");
264 to_return.replace_all("©", "(c)");
265 to_return.replace_all("é", "é");
266 to_return.replace_all("«", "--");
267 to_return.replace_all("‘", "'");
268 to_return.replace_all("“", "'");
269 to_return.replace_all("—", "--");
270 to_return.replace_all("–", "--");
271 to_return.replace_all(" ", " ");
272 to_return.replace_all("»", "--");
273 to_return.replace_all("”", "'");
274 to_return.replace_all("’", "'");
276 to_return.replace_all("%7E", "~");
277 to_return.replace_all("%28", "(");
278 to_return.replace_all("%29", ")");
282 int link_parser::execute()
285 command_line cmds(_global_argc, _global_argv); // process the command line parameters.
286 astring input_filename; // we'll store our bookmarks file's name here.
287 astring output_filename; // where the processed marks go.
288 if (!cmds.get_value('i', input_filename, false))
289 return print_instructions(cmds.program_name());
290 if (!cmds.get_value('o', output_filename, false))
291 return print_instructions(cmds.program_name());
293 BASE_LOG(astring("input file: ") + input_filename);
294 BASE_LOG(astring("output file: ") + output_filename);
296 astring full_contents;
297 byte_filer input_file(input_filename, "r");
298 if (!input_file.good())
299 non_continuable_error(class_name(), func, "the input file could not be opened");
300 input_file.read(full_contents, MAX_FILE_SIZE);
303 filename outname(output_filename);
304 if (outname.exists()) {
305 non_continuable_error(class_name(), func, astring("the output file ")
306 + output_filename + " already exists. It would be over-written if "
310 byte_filer output_file(output_filename, "w");
311 if (!output_file.good())
312 non_continuable_error(class_name(), func, "the output file could not be opened");
314 enum parsing_states {
315 // the states below are order dependent; do not change the ordering!
316 SEEKING_LINK_START, // looking for the beginning of an html link.
317 SEEKING_HREF, // finding the href portion of the link.
318 GETTING_URL, // chowing on the URL portion of the link.
319 SEEKING_NAME, // finding the closing bracket of the <a ...>.
320 GETTING_NAME, // chowing down on characters in the link's name.
321 SEEKING_CLOSURE, // looking for the </a> to end the link.
322 // there is a discontinuity after SEEKING_CLOSURE, but then the following
323 // states are also order dependent.
324 SAW_TITLE_START, // the beginning of a section heading was seen.
325 GETTING_TITLE, // grabbing characters in the title.
326 // new text processing states.
327 SAW_NESTING_INCREASE, // a new nesting level has been achieved.
328 SAW_NESTING_DECREASE, // we exited from a former nesting level.
332 parsing_states state = SEEKING_LINK_START;
333 while (curr_index < full_contents.length()) {
335 case SEEKING_LINK_START:
336 // if we don't see a less-than, then it's not the start of html code,
337 // so we'll ignore it for now.
338 if (full_contents[curr_index] != '<') {
342 // found a left angle bracket, so now we need to decide where to go next for parsing
343 // the html coming up.
345 // see if this is a heading. if so, we can snag the heading name.
346 if (caseless_equals('h', full_contents[curr_index])) {
347 #ifdef DEBUG_LINK_PARSER
348 LOG("into the '<h' case");
350 // check that we're seeing a heading definition here.
351 char next = full_contents[curr_index + 1];
352 if ( (next >= '0') && (next <= '9') ) {
353 // we found our proper character for starting a heading. we need
354 // to jump into that state now. we'll leave the cursor at the
355 // beginning of the number.
356 state = SAW_TITLE_START;
360 // check if they're telling us a new indentation level of the type we care about.
361 if (caseless_equals('d', full_contents[curr_index])) {
362 #ifdef DEBUG_LINK_PARSER
363 LOG("into the '<d' case");
365 // see if they gave us a <dl> tag.
366 char next = full_contents[curr_index + 1];
367 if (caseless_equals(next, 'l')) {
368 #ifdef DEBUG_LINK_PARSER
369 LOG("into the '<dl' case");
371 state = SAW_NESTING_INCREASE;
375 // see if we can find a close for a nesting level.
376 if (caseless_equals('/', full_contents[curr_index])) {
377 #ifdef DEBUG_LINK_PARSER
378 LOG("into the '</' case");
380 // see if they gave us a </dl> tag.
381 if ( caseless_equals(full_contents[curr_index + 1], 'd')
382 && caseless_equals(full_contents[curr_index + 2], 'l') ) {
383 #ifdef DEBUG_LINK_PARSER
384 LOG("into the '</dl' case");
386 state = SAW_NESTING_DECREASE;
390 // see if it's not a link, and abandon ship if it's not, since that's the last option
391 // for html code that we parse.
392 if (!caseless_equals('a', full_contents[curr_index])) {
393 #ifdef DEBUG_LINK_PARSER
394 LOG("into the not an '<a' case");
396 // intermediate_text += '<';
397 JUMP_TO_CHAR('>', false);
400 #ifdef DEBUG_LINK_PARSER
401 LOG("into the final case, the '<a' case");
403 // found an a, but make sure that's the only character in the word.
405 if (!parser_bits::white_space(full_contents[curr_index])) {
406 // intermediate_text += "<a";
407 JUMP_TO_CHAR('>', false);
410 // this looks like an address so find the start of the href.
413 case SAW_NESTING_INCREASE:
414 last_parents.push(last_heading);
415 #ifdef DEBUG_LINK_PARSER
416 LOG(a_sprintf("nesting inwards, new depth %d", last_parents.size()));
418 JUMP_TO_CHAR('>', false);
419 state = SEEKING_LINK_START;
421 case SAW_NESTING_DECREASE:
423 #ifdef DEBUG_LINK_PARSER
424 LOG(a_sprintf("nesting outwards, new depth %d", last_parents.size()));
426 JUMP_TO_CHAR('>', false);
427 state = SEEKING_LINK_START;
430 JUMP_TO_CHAR('h', false); // find the next 'h' for "href".
432 if (!caseless_equals('r', full_contents[curr_index])) continue;
434 if (!caseless_equals('e', full_contents[curr_index])) continue;
436 if (!caseless_equals('f', full_contents[curr_index])) continue;
438 if (full_contents[curr_index] != '=') continue;
440 if (full_contents[curr_index] != '"') continue;
441 // whew, got through the word href and the assignment. the rest
442 // should all be part of the link.
446 // as long as we don't see the closure of the quoted string for the
447 // href, then we can keep accumulating characters from it.
448 if (full_contents[curr_index] == '"') NEXT_STATE_INCREM;
449 url_string += full_contents[curr_index];
450 INCREM_N_GO; // keep chewing on it in this same state.
453 JUMP_TO_CHAR('>', false); // find closing bracket.
454 NEXT_STATE_INCREM; // now start grabbing the name characters.
457 // we have to stop grabbing name characters when we spy a new code
459 if (full_contents[curr_index] == '<') {
460 // if we see a closing command, then we assume it's the one we want.
461 if (full_contents[curr_index + 1] == '/')
463 // if we see html inside the name, we just throw it out.
464 JUMP_TO_CHAR('>', false);
468 name_string += full_contents[curr_index];
469 INCREM_N_GO; // keep chewing on it in this same state.
471 case SEEKING_CLOSURE:
472 JUMP_TO_CHAR('>', false); // find the closure of the html code.
473 // write the link out now.
475 // clean out our accumulated strings.
477 state = SEEKING_LINK_START;
480 case SAW_TITLE_START:
481 heading_num = full_contents.substring(curr_index, curr_index);
482 JUMP_TO_CHAR('>', false);
483 NEXT_STATE_INCREM; // start eating the name.
485 case GETTING_TITLE: {
486 int indy = full_contents.find('<', curr_index);
487 if (negative(indy)) {
488 state = SEEKING_LINK_START; // too weird, go back to start.
491 // push the last title if it differs from the top of the stack.
492 last_heading = full_contents.substring(curr_index, indy - 1);
494 JUMP_TO_CHAR('<', false); // now find the start of the header closure.
495 JUMP_TO_CHAR('>', false); // now find the end of the header closure.
497 state = SEEKING_LINK_START; // successfully found section name.
501 non_continuable_error(class_name(), func, "entered erroneous state!");
505 if (url_string.t()) WRITE_LINK;
509 BASE_LOG(a_sprintf("wrote %d links in %d categories.", _link_count,
515 ////////////////////////////////////////////////////////////////////////////
517 HOOPLE_MAIN(link_parser, )