/*****************************************************************************\
*  Author : Chris Koeritz                                                     *
*  Processes html files and finds the links.  A database in the HOOPLE        *
*  link format is created from the links found.                               *
*******************************************************************************
*  Copyright (c) 1991-$now By Author.  This program is free software; you can *
*  redistribute it and/or modify it under the terms of the GNU General Public *
*  License as published by the Free Software Foundation; either version 2 of  *
*  the License or (at your option) any later version.  This is online at:     *
*      http://www.fsf.org/copyleft/gpl.html                                   *
*  Please send any updates to: fred@gruntose.com                              *
\*****************************************************************************/
// the standard link structure in html is similar to this:
//   <a href="blahblah">Link Name and Launching Point</a>
//
// the standard we adopt for section titles is that it must be a heading
// marker.  that formatting looks like this, for example:
//   <h3 assorted_stuff>The Section Title:</h3>
#include "bookmark_tree.h"

#include <application/command_line.h>
#include <application/hoople_main.h>
#include <basis/astring.h>
#include <basis/functions.h>
#include <basis/guards.h>
#include <filesystem/byte_filer.h>
#include <filesystem/filename.h>
#include <loggers/critical_events.h>
#include <loggers/file_logger.h>
#include <structures/stack.h>
#include <structures/static_memory_gremlin.h>
#include <textual/parser_bits.h>

using namespace application;
using namespace basis;
using namespace filesystem;
using namespace loggers;
using namespace structures;
using namespace textual;
// logs a message that is always printed, regardless of filtering.
#define BASE_LOG(s) program_wide_logger::get().log(s, ALWAYS_PRINT)

// logs a message tagged with this class's name at emergency level.
#define LOG(s) CLASS_EMERGENCY_LOG(program_wide_logger::get(), s)

//#define DEBUG_LINK_PARSER
  // uncomment for noisier run to seek problems.

////////////////////////////////////////////////////////////////////////////
60 const int MAX_FILE_SIZE = 4 * MEGABYTE;
61 // this is the largest html file size we will process.
63 ////////////////////////////////////////////////////////////////////////////
// a macro that increments the position in the string and restarts the loop.
#define INCREM_N_GO { curr_index++; continue; }

// puts the current character on the intermediate string.
#define ADD_INTERMEDIATE \
  intermediate_text += full_contents[curr_index]
// returns a character in lower-case, if 'a' is in upper case.
char normalize_char(char a)
{
  if ( (a >= 'A') && (a <= 'Z') ) return a + 'a' - 'A';
  return a;  // all other characters pass through unchanged.
}

// returns true if the two characters are the same, ignoring upper/lower case.
bool caseless_equals(char a, char b) { return normalize_char(a) == normalize_char(b); }
// a macro that skips all characters until the specified one is seen.  the
// cursor is left pointing at the found character (or at the end of the
// string if the character never appears).  when 'save_them' is true, the
// skipped characters are accumulated on the intermediate string.
#define JUMP_TO_CHAR(to_find, save_them) { \
  while ( (curr_index < full_contents.length()) \
      && !caseless_equals(to_find, full_contents[curr_index]) ) { \
    if (save_them) ADD_INTERMEDIATE; \
    curr_index++; /* keep scanning forward. */ \
  } \
}
// increments the state, the current character and restarts the loop.
#define NEXT_STATE_INCREM { \
  state = parsing_states(state+1); /* move forward in states. */ \
  INCREM_N_GO; \
}
// cleans out the disallowed characters in the string provided: newlines
// become spaces, carriage returns vanish, runs of dashes are blanked (they
// would collide with csv-style quoting), and doubled spaces are collapsed.
#define CLEAN_UP_NAUGHTY(s) { \
  while (s.replace("\n", " ")) {} \
  while (s.replace("\r", "")) {} \
  /* note: this dash cleanup was before the strip_spaces code above. */ \
  int indy = s.find("--"); \
  while (non_negative(indy)) { \
    s[indy] = ' '; /* replace the first dash with a space. */ \
    for (int i = indy + 1; i < s.length(); i++) { \
      if (s[i] != '-') break; \
      s[i] = ' '; /* blank out each remaining dash in the run. */ \
    } \
    indy = s.find("--"); \
  } \
  while (s.replace("  ", " ")) {} /* collapse double spaces to singles. */ \
}
// cleans up underscores in areas that are supposed to be english.
#define MAKE_MORE_ENGLISH(s) \
  s.replace_all('_', ' ')
124 void strain_out_html_codes(astring &to_edit)
126 for (int i = 0; i < to_edit.length(); i++) {
127 if (to_edit[i] != '<') continue;
128 // found a left bracket.
129 int indy = to_edit.find('>', i);
130 if (negative(indy)) return; // bail out, unexpected unmatched bracket.
131 to_edit.zap(i, indy);
132 i--; // skip back to reconsider current place.
// writes out the currently accumulated link info.
#define WRITE_LINK { \
  /* clean naughty characters out of the names. */ \
  CLEAN_UP_NAUGHTY(url_string); \
  CLEAN_UP_NAUGHTY(name_string); \
  if (url_string.ends(name_string)) { \
    /* handle the name being boring. replace with the intermediate text. */ \
    MAKE_MORE_ENGLISH(intermediate_text); \
    strain_out_html_codes(intermediate_text); \
    CLEAN_UP_NAUGHTY(intermediate_text); \
    if (intermediate_text.length()) \
      name_string = intermediate_text; \
  } \
  /* output a link in the HOOPLE format. */ \
  astring to_write = "\"L\",\""; \
  to_write += translate_web_chars(name_string); \
  to_write += "\",\""; \
  to_write += abbreviate_category(last_heading); \
  to_write += "\",\""; \
  to_write += translate_web_chars(url_string); \
  to_write += "\"\n"; \
  output_file.write(to_write); \
  _link_count++; /* keep the final report accurate. */ \
}
// writes out the current section in the HOOPLE format.
// currently the parent category is set to Root.
#define WRITE_SECTION { \
  CLEAN_UP_NAUGHTY(last_heading); /* clean the name. */ \
  /* output a category definition. */ \
  astring to_write = "\"C\",\""; \
  to_write += last_heading; \
  to_write += "\",\""; \
  to_write += abbreviate_category(last_parents.top()); \
  to_write += "\"\n"; \
  output_file.write(to_write); \
  _category_count++; /* keep the final report accurate. */ \
}
// clears our accumulator strings.
#define RESET_STRINGS { \
  url_string = astring::empty_string(); \
  name_string = astring::empty_string(); \
  intermediate_text = astring::empty_string(); \
}
182 ////////////////////////////////////////////////////////////////////////////
184 class link_parser : public application_shell
188 DEFINE_CLASS_NAME("link_parser");
189 virtual int execute();
190 int print_instructions(const filename &program_name);
193 int _link_count; // number of links.
194 int _category_count; // number of categories.
196 astring url_string; // the URL we've parsed.
197 astring name_string; // the name that we've parsed for the URL.
198 astring last_heading; // the last name that was set for a section.
199 stack<astring> last_parents; // the history of the parent names.
200 astring intermediate_text; // strings we saw before a link.
203 // this string form of a number tracks what kind of heading was started.
205 astring abbreviate_category(const astring &simplify);
206 // returns the inner category nickname if the category has one.
208 astring translate_web_chars(const astring &vervoom);
209 // translates a few web chars that are safe for csv back into their non-encoded form.
212 ////////////////////////////////////////////////////////////////////////////
214 link_parser::link_parser()
215 : application_shell(),
218 last_heading("Root"),
221 last_parents.push(last_heading); // make sure we have at least one level.
224 int link_parser::print_instructions(const filename &program_name)
226 a_sprintf to_show("%s:\n\
227 This program needs two filenames as command line parameters. The -i flag\n\
228 is used to specify the input filename and the -o flag specifies the output\n\
229 file to be created. The input file is expected to be an html file\n\
230 containing links to assorted web sites. The links are gathered, along with\n\
231 descriptive text that happens to be near them, to create a link database in\n\
232 the HOOPLE link format and write it to the output file. HOOPLE link format\n\
233 is basically a CSV file that defines the columns 1-4 for describing either\n\
234 link categories (which support hierarchies) or actual links (i.e., URLs of\n\
235 interest). The links are written to a CSV file in the standard HOOPLE link\n\
236 The HOOPLE link format is documented here:\n\
237 http://hoople.org/guides/link_database/format_manifesto.txt\n\
238 ", program_name.basename().raw().s(), program_name.basename().raw().s());
239 program_wide_logger::get().log(to_show, ALWAYS_PRINT);
243 astring link_parser::abbreviate_category(const astring &simplify)
246 astring name_portion;
247 bookmark_tree::break_name(simplify, name_portion, to_return);
248 if (!to_return) return name_portion;
252 astring link_parser::translate_web_chars(const astring &vervoom)
254 astring to_return = vervoom;
255 to_return.replace_all("&", "&");
256 to_return.replace_all("%7E", "~");
260 int link_parser::execute()
263 command_line cmds(_global_argc, _global_argv); // process the command line parameters.
264 astring input_filename; // we'll store our bookmarks file's name here.
265 astring output_filename; // where the processed marks go.
266 if (!cmds.get_value('i', input_filename, false))
267 return print_instructions(cmds.program_name());
268 if (!cmds.get_value('o', output_filename, false))
269 return print_instructions(cmds.program_name());
271 BASE_LOG(astring("input file: ") + input_filename);
272 BASE_LOG(astring("output file: ") + output_filename);
274 astring full_contents;
275 byte_filer input_file(input_filename, "r");
276 if (!input_file.good())
277 non_continuable_error(class_name(), func, "the input file could not be opened");
278 input_file.read(full_contents, MAX_FILE_SIZE);
281 filename outname(output_filename);
282 if (outname.exists()) {
283 non_continuable_error(class_name(), func, astring("the output file ")
284 + output_filename + " already exists. It would be over-written if "
288 byte_filer output_file(output_filename, "w");
289 if (!output_file.good())
290 non_continuable_error(class_name(), func, "the output file could not be opened");
292 enum parsing_states {
293 // the states below are order dependent; do not change the ordering!
294 SEEKING_LINK_START, // looking for the beginning of an html link.
295 SEEKING_HREF, // finding the href portion of the link.
296 GETTING_URL, // chowing on the URL portion of the link.
297 SEEKING_NAME, // finding the closing bracket of the <a ...>.
298 GETTING_NAME, // chowing down on characters in the link's name.
299 SEEKING_CLOSURE, // looking for the </a> to end the link.
300 // there is a discontinuity after SEEKING_CLOSURE, but then the following
301 // states are also order dependent.
302 SAW_TITLE_START, // the beginning of a section heading was seen.
303 GETTING_TITLE, // grabbing characters in the title.
304 // new text processing states.
305 SAW_NESTING_INCREASE, // a new nesting level has been achieved.
306 SAW_NESTING_DECREASE, // we exited from a former nesting level.
310 parsing_states state = SEEKING_LINK_START;
311 while (curr_index < full_contents.length()) {
313 case SEEKING_LINK_START:
314 // if we don't see a less-than, then it's not the start of html code,
315 // so we'll ignore it for now.
316 if (full_contents[curr_index] != '<') {
320 // found a left angle bracket, so now we need to decided where to go next for parsing
321 // the html coming up.
323 // see if this is a heading. if so, we can snag the heading name.
324 if (caseless_equals('h', full_contents[curr_index])) {
325 #ifdef DEBUG_LINK_PARSER
326 LOG("into the '<h' case");
328 // check that we're seeing a heading definition here.
329 char next = full_contents[curr_index + 1];
330 if ( (next >= '0') && (next <= '9') ) {
331 // we found our proper character for starting a heading. we need
332 // to jump into that state now. we'll leave the cursor at the
333 // beginning of the number.
334 state = SAW_TITLE_START;
338 // check if they're telling us a new indentation level of the type we care about.
339 if (caseless_equals('d', full_contents[curr_index])) {
340 #ifdef DEBUG_LINK_PARSER
341 LOG("into the '<d' case");
343 // see if they gave us a <dl> tag.
344 char next = full_contents[curr_index + 1];
345 if (caseless_equals(next, 'l')) {
346 #ifdef DEBUG_LINK_PARSER
347 LOG("into the '<dl' case");
349 state = SAW_NESTING_INCREASE;
353 // see if we can find a close for a nesting level.
354 if (caseless_equals('/', full_contents[curr_index])) {
355 #ifdef DEBUG_LINK_PARSER
356 LOG("into the '</' case");
358 // see if they gave us a <dl> tag.
359 if ( caseless_equals(full_contents[curr_index + 1], 'd')
360 && caseless_equals(full_contents[curr_index + 2], 'l') ) {
361 #ifdef DEBUG_LINK_PARSER
362 LOG("into the '</dl' case");
364 state = SAW_NESTING_DECREASE;
368 // see if it's not a link, and abandon ship if it's not, since that's the last option
369 // for html code that we parse.
370 if (!caseless_equals('a', full_contents[curr_index])) {
371 #ifdef DEBUG_LINK_PARSER
372 LOG("into the not an '<a' case");
374 intermediate_text += '<';
375 JUMP_TO_CHAR('>', true);
378 #ifdef DEBUG_LINK_PARSER
379 LOG("into the final case, the '<a' case");
381 // found an a, but make sure that's the only character in the word.
383 if (!parser_bits::white_space(full_contents[curr_index])) {
384 intermediate_text += "<a";
385 JUMP_TO_CHAR('>', true);
388 // this looks like an address so find the start of the href.
391 case SAW_NESTING_INCREASE:
392 last_parents.push(last_heading);
393 #ifdef DEBUG_LINK_PARSER
394 LOG(a_sprintf("nesting inwards, new depth %d", last_parents.size()));
396 JUMP_TO_CHAR('>', false);
397 state = SEEKING_LINK_START;
399 case SAW_NESTING_DECREASE:
401 #ifdef DEBUG_LINK_PARSER
402 LOG(a_sprintf("nesting outwards, new depth %d", last_parents.size()));
404 JUMP_TO_CHAR('>', false);
405 state = SEEKING_LINK_START;
408 JUMP_TO_CHAR('h', false); // find the next 'h' for "href".
410 if (!caseless_equals('r', full_contents[curr_index])) continue;
412 if (!caseless_equals('e', full_contents[curr_index])) continue;
414 if (!caseless_equals('f', full_contents[curr_index])) continue;
416 if (full_contents[curr_index] != '=') continue;
418 if (full_contents[curr_index] != '"') continue;
419 // whew, got through the word href and the assignment. the rest
420 // should all be part of the link.
424 // as long as we don't see the closure of the quoted string for the
425 // href, then we can keep accumulating characters from it.
426 if (full_contents[curr_index] == '"') NEXT_STATE_INCREM;
427 url_string += full_contents[curr_index];
428 INCREM_N_GO; // keep chewing on it in this same state.
431 JUMP_TO_CHAR('>', false); // find closing bracket.
432 NEXT_STATE_INCREM; // now start grabbing the name characters.
435 // we have to stop grabbing name characters when we spy a new code
437 if (full_contents[curr_index] == '<') {
438 // if we see a closing command, then we assume it's the one we want.
439 if (full_contents[curr_index + 1] == '/')
441 // if we see html inside the name, we just throw it out.
442 JUMP_TO_CHAR('>', false);
446 name_string += full_contents[curr_index];
447 INCREM_N_GO; // keep chewing on it in this same state.
449 case SEEKING_CLOSURE:
450 JUMP_TO_CHAR('>', false); // find the closure of the html code.
451 // write the link out now.
453 // clean out our accumulated strings.
455 state = SEEKING_LINK_START;
458 case SAW_TITLE_START:
459 heading_num = full_contents.substring(curr_index, curr_index);
460 JUMP_TO_CHAR('>', false);
461 NEXT_STATE_INCREM; // start eating the name.
463 case GETTING_TITLE: {
464 int indy = full_contents.find('<', curr_index);
465 if (negative(indy)) {
466 state = SEEKING_LINK_START; // too weird, go back to start.
469 // push the last title if it differs from the top of the stack.
470 last_heading = full_contents.substring(curr_index, indy - 1);
472 JUMP_TO_CHAR('<', false); // now find the start of the header closure.
473 JUMP_TO_CHAR('>', false); // now find the end of the header closure.
475 state = SEEKING_LINK_START; // successfully found section name.
479 non_continuable_error(class_name(), func, "entered erroneous state!");
483 if (url_string.t()) WRITE_LINK;
487 BASE_LOG(a_sprintf("wrote %d links in %d categories.", _link_count,
493 ////////////////////////////////////////////////////////////////////////////
495 HOOPLE_MAIN(link_parser, )