feisty meow concerns codebase 2.140
link_parser.cpp
Go to the documentation of this file.
/*****************************************************************************\
*                                                                             *
*  Name   : link_parser                                                       *
*  Author : Chris Koeritz                                                     *
*                                                                             *
*  Purpose:                                                                   *
*                                                                             *
*    Processes html files and finds the links. A database in the HOOPLE       *
*    link format is created from the links found.                             *
*                                                                             *
*******************************************************************************
* Copyright (c) 1991-$now By Author. This program is free software; you can   *
* redistribute it and/or modify it under the terms of the GNU General Public  *
* License as published by the Free Software Foundation; either version 2 of   *
* the License or (at your option) any later version. This is online at:       *
*       http://www.fsf.org/copyleft/gpl.html                                  *
* Please send any updates to: fred@gruntose.com                               *
\*****************************************************************************/
19
// Notes:
//
// the standard link structure in html is similar to this:
//   <a href="blahblah">Link Name and Launching Point</a>
//
// the standard we adopt for section titles is that it must be a heading
// marker. that formatting looks like this, for example:
//   <h3 assorted_stuff>The Section Title:</h3>
28
29#include "bookmark_tree.h"
30
32#include <basis/astring.h>
33#include <basis/functions.h>
34#include <basis/guards.h>
36#include <filesystem/filename.h>
38#include <loggers/file_logger.h>
39#include <structures/stack.h>
41#include <textual/parser_bits.h>
42
43using namespace application;
44using namespace basis;
45using namespace filesystem;
46using namespace loggers;
47using namespace structures;
48using namespace textual;
49
// logging helpers for this program. BASE_LOG sends the message to the
// program-wide logger with the ALWAYS_PRINT filter; LOG hands the message and
// the program-wide logger to CLASS_EMERGENCY_LOG.
#undef BASE_LOG
#define BASE_LOG(s) program_wide_logger::get().log(s, ALWAYS_PRINT)
#undef LOG
#define LOG(s) CLASS_EMERGENCY_LOG(program_wide_logger::get(), s)
54
//#define DEBUG_LINK_PARSER
  // uncomment for noisier run to seek problems.
57
59
60const int MAX_FILE_SIZE = 4 * MEGABYTE;
61 // this is the largest html file size we will process.
62
64
// a macro that increments the position in the string and restarts the loop.
// it expects an int 'curr_index' in scope and must be used inside a loop.
// this is deliberately a bare brace block rather than a do/while(0) wrapper,
// since the 'continue' has to bind to the caller's enclosing loop.
#define INCREM_N_GO { curr_index++; continue; }
67
// puts the current character on the intermediate string. angle brackets are
// not stored as-is; a dash is stored in their place. expects 'full_contents',
// 'curr_index' and 'intermediate_text' to be in scope at the point of use.
#define ADD_INTERMEDIATE { \
  char add_in = full_contents[curr_index]; \
  if ( (add_in == '<') || (add_in == '>') ) { \
    add_in = '-'; \
  } \
  intermediate_text += add_in; \
}
76
// returns a character in lower-case, if 'a' is in upper case.
// only the range 'A' through 'Z' is converted; any other character is
// returned unchanged.
char normalize_char(char a)
{
  if ( (a >= 'A') && (a <= 'Z') ) {
    // the arithmetic is done in int, so narrow back to char explicitly.
    return static_cast<char>(a + ('a' - 'A'));
  }
  return a;
}
83
84// returns true if the two characters are the same, ignoring upper/lower case.
85bool caseless_equals(char a, char b) { return normalize_char(a) == normalize_char(b); }
86
// a macro that skips all characters until the specified one is seen.
// the comparison with 'to_find' ignores case. when the macro finishes,
// 'curr_index' is either on the matching character or at the end of
// 'full_contents'. if 'save_them' is true, each skipped character is also
// recorded through ADD_INTERMEDIATE.
#define JUMP_TO_CHAR(to_find, save_them) { \
  while ( (curr_index < full_contents.length()) \
      && !caseless_equals(to_find, full_contents[curr_index]) ) { \
    if (save_them) ADD_INTERMEDIATE; \
    curr_index++; \
  } \
}
95
// increments the state, the current character and restarts the loop.
// expects 'state' (of enum type parsing_states) and 'curr_index' in scope, and
// relies on the enum values being declared in the order they are visited.
// as with INCREM_N_GO, this stays a bare brace block so that the 'continue'
// applies to the caller's loop.
#define NEXT_STATE_INCREM { \
  state = parsing_states(state+1);  /* move forward in states. */ \
  curr_index++; \
  continue; \
}
102
// cleans out the disallowed characters in the string provided.
// each newline becomes a space, carriage returns are removed, and then
// strip_spaces() is applied. the replace() calls are repeated until they
// report that nothing more was replaced.
#define CLEAN_UP_NAUGHTY(s) { \
  while (s.replace("\n", " ")) {} \
  while (s.replace("\r", "")) {} \
  s.strip_spaces(); \
}
109
//was before the strip_spaces code above.
/*
  int indy = s.find("--"); \
  while (non_negative(indy)) { \
    s[indy] = ' ';  / * replace the first dash with a space. * / \
    for (int i = indy + 1; i < s.length(); i++) { \
      if (s[i] != '-') break; \
      s.zap(i, i); \
      i--; \
    } \
    indy = s.find("--"); \
  } \
  while (s.replace("  ", " ")) {} \
*/
124
// cleans up underscores in areas that are supposed to be english, by turning
// every underscore in the string into a space.
#define MAKE_MORE_ENGLISH(s) \
  s.replace_all('_', ' ')
128
130{
131 for (int i = 0; i < to_edit.length(); i++) {
132 if (to_edit[i] != '<') continue;
133 // found a left bracket.
134 int indy = to_edit.find('>', i);
135 if (negative(indy)) return; // bail out, unexpected unmatched bracket.
136 to_edit.zap(i, indy);
137 i--; // skip back to reconsider current place.
138 }
139}
140
// writes out the currently accumulated link info.
// the record has the form "L","name","category","url" followed by a newline,
// where the category is the abbreviated form of the last heading seen. the
// link count is bumped for each record written. expects 'output_file' and
// the parser's member strings to be in scope.
#define WRITE_LINK { \
  /* clean naughty characters out of the names. */ \
  CLEAN_UP_NAUGHTY(url_string); \
  CLEAN_UP_NAUGHTY(name_string); \
  /* output a link in the HOOPLE format. */ \
  astring to_write = "\"L\",\""; \
  to_write += translate_web_chars(name_string); \
  to_write += "\",\""; \
  to_write += abbreviate_category(last_heading); \
  to_write += "\",\""; \
  to_write += translate_web_chars(url_string); \
  to_write += "\"\n"; \
  output_file.write(to_write); \
  _link_count++; \
}
//was after second clean up naughty
/*argh yuck... if (url_string.ends(name_string)) { \
    / * handle the name being boring. replace with the intermediate text. * / \
    MAKE_MORE_ENGLISH(intermediate_text); \
    strain_out_html_codes(intermediate_text); \
    CLEAN_UP_NAUGHTY(intermediate_text); \
    if (intermediate_text.length()) \
      name_string = intermediate_text; \
  } \
*/
167
// writes out the current section in the HOOPLE format.
// the record has the form "C","heading","parent" followed by a newline. the
// parent is the abbreviated form of whatever is on top of the 'last_parents'
// stack, so that stack must not be empty when this is used. the category
// count is bumped for each record written.
#define WRITE_SECTION { \
  CLEAN_UP_NAUGHTY(last_heading);  /* clean the name. */ \
  /* output a category definition. */ \
  astring to_write = "\"C\",\""; \
  to_write += translate_web_chars(last_heading); \
  to_write += "\",\""; \
  to_write += abbreviate_category(last_parents.top()); \
  to_write += "\"\n"; \
  output_file.write(to_write); \
  _category_count++; \
}
181
// clears our accumulator strings (the url, the link name and the
// intermediate text). the last heading is left as it is.
#define RESET_STRINGS { \
  url_string = astring::empty_string(); \
  name_string = astring::empty_string(); \
  intermediate_text = astring::empty_string(); \
}
188
190
191class link_parser : public application_shell
192{
193public:
194 link_parser();
195 DEFINE_CLASS_NAME("link_parser");
196 virtual int execute();
197 int print_instructions(const filename &program_name);
198
199private:
200 int _link_count; // number of links.
201 int _category_count; // number of categories.
202
203 astring url_string; // the URL we've parsed.
204 astring name_string; // the name that we've parsed for the URL.
205 astring last_heading; // the last name that was set for a section.
206 stack<astring> last_parents; // the history of the parent names.
207 astring intermediate_text; // strings we saw before a link.
208
209 astring heading_num;
210 // this string form of a number tracks what kind of heading was started.
211
212 astring abbreviate_category(const astring &simplify);
213 // returns the inner category nickname if the category has one.
214
215 astring translate_web_chars(const astring &vervoom);
216 // translates a few web chars that are safe for csv back into their non-encoded form.
217};
218
220
221link_parser::link_parser()
223 _link_count(0),
224 _category_count(0),
225 last_heading("Root"),
226 last_parents()
227{
228 last_parents.push(last_heading); // make sure we have at least one level.
229}
230
231int link_parser::print_instructions(const filename &program_name)
232{
233 a_sprintf to_show("%s:\n\
234This program needs two filenames as command line parameters. The -i flag\n\
235is used to specify the input filename and the -o flag specifies the output\n\
236file to be created. The input file is expected to be an html file\n\
237containing links to assorted web sites. The links are gathered, along with\n\
238descriptive text that happens to be near them, to create a link database in\n\
239the HOOPLE link format and write it to the output file. HOOPLE link format\n\
240is basically a CSV file that defines the columns 1-4 for describing either\n\
241link categories (which support hierarchies) or actual links (i.e., URLs of\n\
242interest). The links are written to a CSV file in the standard HOOPLE link\n\
243The HOOPLE link format is documented here:\n\
244 http://feistymeow.org/guides/link_database/format_manifesto.txt\n\
245", program_name.basename().raw().s(), program_name.basename().raw().s());
246 program_wide_logger::get().log(to_show, ALWAYS_PRINT);
247 return 12;
248}
249
250astring link_parser::abbreviate_category(const astring &simplify)
251{
252 astring to_return;
253 astring name_portion;
254 bookmark_tree::break_name(simplify, name_portion, to_return);
255 if (!to_return) return name_portion;
256 return to_return;
257}
258
259astring link_parser::translate_web_chars(const astring &vervoom)
260{
261 astring to_return = vervoom;
262 to_return.replace_all("&amp;", "&");
263 to_return.replace_all("&auml;", "ä");
264 to_return.replace_all("&copy;", "(c)");
265 to_return.replace_all("&eacute;", "é");
266 to_return.replace_all("&laquo;", "--");
267 to_return.replace_all("&lsquo;", "'");
268 to_return.replace_all("&ldquo;", "'");
269 to_return.replace_all("&mdash;", "--");
270 to_return.replace_all("&ndash;", "--");
271 to_return.replace_all("&nbsp;", " ");
272 to_return.replace_all("&raquo;", "--");
273 to_return.replace_all("&rdquo;", "'");
274 to_return.replace_all("&rsquo;", "'");
275
276 to_return.replace_all("%7E", "~");
277 to_return.replace_all("%28", "(");
278 to_return.replace_all("%29", ")");
279 return to_return;
280}
281
282int link_parser::execute()
283{
284 FUNCDEF("main");
285 command_line cmds(_global_argc, _global_argv); // process the command line parameters.
286 astring input_filename; // we'll store our bookmarks file's name here.
287 astring output_filename; // where the processed marks go.
288 if (!cmds.get_value('i', input_filename, false))
289 return print_instructions(cmds.program_name());
290 if (!cmds.get_value('o', output_filename, false))
291 return print_instructions(cmds.program_name());
292
293 BASE_LOG(astring("input file: ") + input_filename);
294 BASE_LOG(astring("output file: ") + output_filename);
295
296 astring full_contents;
297 byte_filer input_file(input_filename, "r");
298 if (!input_file.good())
299 non_continuable_error(class_name(), func, "the input file could not be opened");
300 input_file.read(full_contents, MAX_FILE_SIZE);
301 input_file.close();
302
303 filename outname(output_filename);
304 if (outname.exists()) {
305 non_continuable_error(class_name(), func, astring("the output file ")
306 + output_filename + " already exists. It would be over-written if "
307 "we continued.");
308 }
309
310 byte_filer output_file(output_filename, "w");
311 if (!output_file.good())
312 non_continuable_error(class_name(), func, "the output file could not be opened");
313
314 enum parsing_states {
315 // the states below are order dependent; do not change the ordering!
316 SEEKING_LINK_START, // looking for the beginning of an html link.
317 SEEKING_HREF, // finding the href portion of the link.
318 GETTING_URL, // chowing on the URL portion of the link.
319 SEEKING_NAME, // finding the closing bracket of the <a ...>.
320 GETTING_NAME, // chowing down on characters in the link's name.
321 SEEKING_CLOSURE, // looking for the </a> to end the link.
322 // there is a discontinuity after SEEKING_CLOSURE, but then the following
323 // states are also order dependent.
324 SAW_TITLE_START, // the beginning of a section heading was seen.
325 GETTING_TITLE, // grabbing characters in the title.
326 // new text processing states.
327 SAW_NESTING_INCREASE, // a new nesting level has been achieved.
328 SAW_NESTING_DECREASE, // we exited from a former nesting level.
329 };
330
331 int curr_index = 0;
332 parsing_states state = SEEKING_LINK_START;
333 while (curr_index < full_contents.length()) {
334 switch (state) {
335 case SEEKING_LINK_START:
336 // if we don't see a less-than, then it's not the start of html code,
337 // so we'll ignore it for now.
338 if (full_contents[curr_index] != '<') {
341 }
342 // found a left angle bracket, so now we need to decided where to go next for parsing
343 // the html coming up.
344 curr_index++;
345 // see if this is a heading. if so, we can snag the heading name.
346 if (caseless_equals('h', full_contents[curr_index])) {
347#ifdef DEBUG_LINK_PARSER
348 LOG("into the '<h' case");
349#endif
350 // check that we're seeing a heading definition here.
351 char next = full_contents[curr_index + 1];
352 if ( (next >= '0') && (next <= '9') ) {
353 // we found our proper character for starting a heading. we need
354 // to jump into that state now. we'll leave the cursor at the
355 // beginning of the number.
356 state = SAW_TITLE_START;
358 }
359 }
360 // check if they're telling us a new indentation level of the type we care about.
361 if (caseless_equals('d', full_contents[curr_index])) {
362#ifdef DEBUG_LINK_PARSER
363 LOG("into the '<d' case");
364#endif
365 // see if they gave us a <dl> tag.
366 char next = full_contents[curr_index + 1];
367 if (caseless_equals(next, 'l')) {
368#ifdef DEBUG_LINK_PARSER
369 LOG("into the '<dl' case");
370#endif
371 state = SAW_NESTING_INCREASE;
373 }
374 }
375 // see if we can find a close for a nesting level.
376 if (caseless_equals('/', full_contents[curr_index])) {
377#ifdef DEBUG_LINK_PARSER
378 LOG("into the '</' case");
379#endif
380 // see if they gave us a <dl> tag.
381 if ( caseless_equals(full_contents[curr_index + 1], 'd')
382 && caseless_equals(full_contents[curr_index + 2], 'l') ) {
383#ifdef DEBUG_LINK_PARSER
384 LOG("into the '</dl' case");
385#endif
386 state = SAW_NESTING_DECREASE;
388 }
389 }
390 // see if it's not a link, and abandon ship if it's not, since that's the last option
391 // for html code that we parse.
392 if (!caseless_equals('a', full_contents[curr_index])) {
393#ifdef DEBUG_LINK_PARSER
394 LOG("into the not an '<a' case");
395#endif
396// intermediate_text += '<';
397 JUMP_TO_CHAR('>', false);
398 continue;
399 }
400#ifdef DEBUG_LINK_PARSER
401 LOG("into the final case, the '<a' case");
402#endif
403 // found an a, but make sure that's the only character in the word.
404 curr_index++;
405 if (!parser_bits::white_space(full_contents[curr_index])) {
406// intermediate_text += "<a";
407 JUMP_TO_CHAR('>', false);
408 continue;
409 }
410 // this looks like an address so find the start of the href.
412 break;
413 case SAW_NESTING_INCREASE:
414 last_parents.push(last_heading);
415#ifdef DEBUG_LINK_PARSER
416 LOG(a_sprintf("nesting inwards, new depth %d", last_parents.size()));
417#endif
418 JUMP_TO_CHAR('>', false);
419 state = SEEKING_LINK_START;
420 break;
421 case SAW_NESTING_DECREASE:
422 last_parents.pop();
423#ifdef DEBUG_LINK_PARSER
424 LOG(a_sprintf("nesting outwards, new depth %d", last_parents.size()));
425#endif
426 JUMP_TO_CHAR('>', false);
427 state = SEEKING_LINK_START;
428 break;
429 case SEEKING_HREF:
430 JUMP_TO_CHAR('h', false); // find the next 'h' for "href".
431 curr_index++;
432 if (!caseless_equals('r', full_contents[curr_index])) continue;
433 curr_index++;
434 if (!caseless_equals('e', full_contents[curr_index])) continue;
435 curr_index++;
436 if (!caseless_equals('f', full_contents[curr_index])) continue;
437 curr_index++;
438 if (full_contents[curr_index] != '=') continue;
439 curr_index++;
440 if (full_contents[curr_index] != '"') continue;
441 // whew, got through the word href and the assignment. the rest
442 // should all be part of the link.
444 break;
445 case GETTING_URL:
446 // as long as we don't see the closure of the quoted string for the
447 // href, then we can keep accumulating characters from it.
448 if (full_contents[curr_index] == '"') NEXT_STATE_INCREM;
449 url_string += full_contents[curr_index];
450 INCREM_N_GO; // keep chewing on it in this same state.
451 break;
452 case SEEKING_NAME:
453 JUMP_TO_CHAR('>', false); // find closing bracket.
454 NEXT_STATE_INCREM; // now start grabbing the name characters.
455 break;
456 case GETTING_NAME:
457 // we have to stop grabbing name characters when we spy a new code
458 // being started.
459 if (full_contents[curr_index] == '<') {
460 // if we see a closing command, then we assume it's the one we want.
461 if (full_contents[curr_index + 1] == '/')
463 // if we see html inside the name, we just throw it out.
464 JUMP_TO_CHAR('>', false);
465 curr_index++;
466 continue;
467 }
468 name_string += full_contents[curr_index];
469 INCREM_N_GO; // keep chewing on it in this same state.
470 break;
471 case SEEKING_CLOSURE:
472 JUMP_TO_CHAR('>', false); // find the closure of the html code.
473 // write the link out now.
475 // clean out our accumulated strings.
477 state = SEEKING_LINK_START;
479 break;
480 case SAW_TITLE_START:
481 heading_num = full_contents.substring(curr_index, curr_index);
482 JUMP_TO_CHAR('>', false);
483 NEXT_STATE_INCREM; // start eating the name.
484 break;
485 case GETTING_TITLE: {
486 int indy = full_contents.find('<', curr_index);
487 if (negative(indy)) {
488 state = SEEKING_LINK_START; // too weird, go back to start.
489 continue;
490 }
491 // push the last title if it differs from the top of the stack.
492 last_heading = full_contents.substring(curr_index, indy - 1);
494 JUMP_TO_CHAR('<', false); // now find the start of the header closure.
495 JUMP_TO_CHAR('>', false); // now find the end of the header closure.
497 state = SEEKING_LINK_START; // successfully found section name.
498 break;
499 }
500 default:
501 non_continuable_error(class_name(), func, "entered erroneous state!");
502 }
503 }
504
505 if (url_string.t()) WRITE_LINK;
506
507 output_file.close();
508
509 BASE_LOG(a_sprintf("wrote %d links in %d categories.", _link_count,
510 _category_count));
511
512 return 0;
513}
514
516
517HOOPLE_MAIN(link_parser, )
518
int print_instructions(bool good, const astring &program_name)
Definition checker.cpp:45
The application_shell is a base object for console programs.
virtual int execute()=0
< retrieves the command line from the /proc hierarchy on linux.
a_sprintf is a specialization of astring that provides printf style support.
Definition astring.h:440
Provides a dynamically resizable ASCII character string.
Definition astring.h:35
const char * s() const
synonym for observe. the 's' stands for "string", if that helps.
Definition astring.h:113
virtual void zap(int start, int end)
Deletes the characters between "start" and "end" inclusively.
Definition astring.cpp:524
bool substring(astring &target, int start, int end) const
a version that stores the substring in an existing "target" string.
Definition astring.cpp:868
bool replace_all(char to_replace, char new_char)
changes all occurrences of "to_replace" with "new_char".
Definition astring.cpp:932
int length() const
Returns the current length of the string.
Definition astring.cpp:132
int find(char to_find, int position=0, bool reverse=false) const
Locates "to_find" in "this".
Definition astring.cpp:577
virtual outcome log(const base_string &info, int filter)=0
writes the information in "info" to the logger using the "filter".
static void break_name(const basis::astring &to_break, basis::astring &name, basis::astring &nick)
Provides file managment services using the standard I/O support.
Definition byte_filer.h:32
Provides operations commonly needed on file names.
Definition filename.h:64
const basis::astring & raw() const
returns the astring that we're holding onto for the path.
Definition filename.cpp:97
filename basename() const
returns the base of the filename; no directory.
Definition filename.cpp:385
static loggers::standard_log_base & get()
Provided by the startup code within each application for logging.
An abstraction that represents a stack data structure.
Definition stack.h:30
static bool white_space(char to_check)
returns true if the character "to_check" is considered a white space.
#define non_continuable_error(c, f, i)
an extra piece of information used, if available, in bounds_halt below.
#define DEFINE_CLASS_NAME(objname)
Defines the name of a class by providing a couple standard methods.
Definition enhance_cpp.h:42
#define FUNCDEF(func_in)
FUNCDEF sets the name of a function (and plugs it into the callstack).
Definition enhance_cpp.h:54
Provides macros that implement the 'main' program of an application.
#define HOOPLE_MAIN(obj_name, obj_args)
options that should work for most unix and linux apps.
Definition hoople_main.h:61
Implements an application lock to ensure only one is running at once.
char ** _global_argv
The guards collection helps in testing preconditions and reporting errors.
Definition array.h:30
const int MEGABYTE
Number of bytes in a megabyte.
bool negative(const type &a)
negative returns true if "a" is less than zero.
Definition functions.h:43
A platform independent way to obtain the timestamp of a file.
A logger that sends to the console screen using the standard output device.
A dynamic container class that holds any kind of object via pointers.
Definition amorph.h:55