44 using namespace basis;
// BASE_LOG(s): hands "s" to the program-wide logger's log() method, passing
// ALWAYS_PRINT as the second argument.
51 #define BASE_LOG(s) program_wide_logger::get().log(s, ALWAYS_PRINT)
// LOG(s): forwards the program-wide logger and "s" to CLASS_EMERGENCY_LOG,
// which is defined outside this file.  The uses visible in this file all sit
// inside #ifdef DEBUG_LINK_PARSER sections.
53 #define LOG(s) CLASS_EMERGENCY_LOG(program_wide_logger::get(), s)
// INCREM_N_GO: bumps curr_index by one and then continues the enclosing loop.
// The expansion site must have a variable named curr_index in scope and must
// be inside a loop.
// NOTE: do not wrap this in do { } while (0); the "continue" would then apply
// to that wrapper loop instead of the caller's loop.
66 #define INCREM_N_GO { curr_index++; continue; }
69 #define ADD_INTERMEDIATE { \
70 char add_in = full_contents[curr_index]; \
71 if ( (add_in == '<') || (add_in == '>') ) { \
74 intermediate_text += add_in; \
80 if ( (a >=
'A') && (a <=
'Z') )
return a +
'a' -
'A';
88 #define JUMP_TO_CHAR(to_find, save_them) { \
89 while ( (curr_index < full_contents.length()) \
90 && !caseless_equals(to_find, full_contents[curr_index]) ) { \
91 if (save_them) ADD_INTERMEDIATE; \
97 #define NEXT_STATE_INCREM { \
98 state = parsing_states(state+1); \
104 #define CLEAN_UP_NAUGHTY(s) { \
105 while (s.replace("\n", " ")) {} \
106 while (s.replace("\r", "")) {} \
// MAKE_MORE_ENGLISH(s): calls s.replace_all('_', ' ') so that underscores in
// "s" become spaces.  "s" must offer replace_all(char, char); presumably it is
// an astring -- confirm at the call sites.
// NOTE(review): the argument is expanded without parentheses, so pass a plain
// object name rather than a compound expression.
126 #define MAKE_MORE_ENGLISH(s) \
127 s.replace_all('_', ' ')
131 for (
int i = 0; i < to_edit.
length(); i++) {
132 if (to_edit[i] !=
'<')
continue;
134 int indy = to_edit.
find(
'>', i);
136 to_edit.
zap(i, indy);
142 #define WRITE_LINK { \
144 CLEAN_UP_NAUGHTY(url_string); \
145 CLEAN_UP_NAUGHTY(name_string); \
147 astring to_write = "\"L\",\""; \
148 to_write += translate_web_chars(name_string); \
149 to_write += "\",\""; \
150 to_write += abbreviate_category(last_heading); \
151 to_write += "\",\""; \
152 to_write += translate_web_chars(url_string); \
153 to_write += "\"\n"; \
154 output_file.write(to_write); \
170 #define WRITE_SECTION { \
171 CLEAN_UP_NAUGHTY(last_heading); \
173 astring to_write = "\"C\",\""; \
174 to_write += translate_web_chars(last_heading); \
175 to_write += "\",\""; \
176 to_write += abbreviate_category(last_parents.top()); \
177 to_write += "\"\n"; \
178 output_file.write(to_write); \
183 #define RESET_STRINGS { \
184 url_string = astring::empty_string(); \
185 name_string = astring::empty_string(); \
186 intermediate_text = astring::empty_string(); \
196 virtual int execute();
221 link_parser::link_parser()
225 last_heading(
"Root"),
228 last_parents.push(last_heading);
234 This program needs two filenames as command line parameters. The -i flag\n\
235 is used to specify the input filename and the -o flag specifies the output\n\
236 file to be created. The input file is expected to be an html file\n\
237 containing links to assorted web sites. The links are gathered, along with\n\
238 descriptive text that happens to be near them, to create a link database in\n\
239 the HOOPLE link format and write it to the output file. HOOPLE link format\n\
240 is basically a CSV file that defines the columns 1-4 for describing either\n\
241 link categories (which support hierarchies) or actual links (i.e., URLs of\n\
242 interest). The links are written to a CSV file in the standard HOOPLE link\n\
243 The HOOPLE link format is documented here:\n\
244 http://feistymeow.org/guides/link_database/format_manifesto.txt\n\
246 program_wide_logger::get().log(to_show, ALWAYS_PRINT);
255 if (!to_return)
return name_portion;
282 int link_parser::execute()
288 if (!cmds.get_value(
'i', input_filename,
false))
290 if (!cmds.get_value(
'o', output_filename,
false))
298 if (!input_file.good())
304 if (outname.exists()) {
306 + output_filename +
" already exists. It would be over-written if "
311 if (!output_file.good())
314 enum parsing_states {
327 SAW_NESTING_INCREASE,
328 SAW_NESTING_DECREASE,
332 parsing_states state = SEEKING_LINK_START;
333 while (curr_index < full_contents.
length()) {
335 case SEEKING_LINK_START:
338 if (full_contents[curr_index] !=
'<') {
347 #ifdef DEBUG_LINK_PARSER
348 LOG(
"into the '<h' case");
351 char next = full_contents[curr_index + 1];
352 if ( (next >=
'0') && (next <=
'9') ) {
356 state = SAW_TITLE_START;
362 #ifdef DEBUG_LINK_PARSER
363 LOG(
"into the '<d' case");
366 char next = full_contents[curr_index + 1];
368 #ifdef DEBUG_LINK_PARSER
369 LOG(
"into the '<dl' case");
371 state = SAW_NESTING_INCREASE;
377 #ifdef DEBUG_LINK_PARSER
378 LOG(
"into the '</' case");
383 #ifdef DEBUG_LINK_PARSER
384 LOG(
"into the '</dl' case");
386 state = SAW_NESTING_DECREASE;
393 #ifdef DEBUG_LINK_PARSER
394 LOG(
"into the not an '<a' case");
400 #ifdef DEBUG_LINK_PARSER
401 LOG(
"into the final case, the '<a' case");
405 if (!parser_bits::white_space(full_contents[curr_index])) {
413 case SAW_NESTING_INCREASE:
414 last_parents.push(last_heading);
415 #ifdef DEBUG_LINK_PARSER
416 LOG(
a_sprintf(
"nesting inwards, new depth %d", last_parents.size()));
419 state = SEEKING_LINK_START;
421 case SAW_NESTING_DECREASE:
423 #ifdef DEBUG_LINK_PARSER
424 LOG(
a_sprintf(
"nesting outwards, new depth %d", last_parents.size()));
427 state = SEEKING_LINK_START;
438 if (full_contents[curr_index] !=
'=')
continue;
440 if (full_contents[curr_index] !=
'"')
continue;
449 url_string += full_contents[curr_index];
459 if (full_contents[curr_index] ==
'<') {
461 if (full_contents[curr_index + 1] ==
'/')
468 name_string += full_contents[curr_index];
471 case SEEKING_CLOSURE:
477 state = SEEKING_LINK_START;
480 case SAW_TITLE_START:
481 heading_num = full_contents.
substring(curr_index, curr_index);
485 case GETTING_TITLE: {
486 int indy = full_contents.
find(
'<', curr_index);
488 state = SEEKING_LINK_START;
492 last_heading = full_contents.
substring(curr_index, indy - 1);
497 state = SEEKING_LINK_START;
int print_instructions(bool good, const astring &program_name)
The application_shell is a base object for console programs.
a_sprintf is a specialization of astring that provides printf style support.
Provides a dynamically resizable ASCII character string.
const char * s() const
synonym for observe. the 's' stands for "string", if that helps.
virtual void zap(int start, int end)
Deletes the characters between "start" and "end" inclusively.
bool substring(astring &target, int start, int end) const
a version that stores the substring in an existing "target" string.
bool replace_all(char to_replace, char new_char)
changes all occurrences of "to_replace" with "new_char".
int length() const
Returns the current length of the string.
int find(char to_find, int position=0, bool reverse=false) const
Locates "to_find" in "this".
static void break_name(const basis::astring &to_break, basis::astring &name, basis::astring &nick)
Provides file management services using the standard I/O support.
Provides operations commonly needed on file names.
const basis::astring & raw() const
returns the astring that we're holding onto for the path.
filename basename() const
returns the base of the filename; no directory.
#define non_continuable_error(c, f, i)
an extra piece of information used, if available, in bounds_halt below.
#define DEFINE_CLASS_NAME(objname)
Defines the name of a class by providing a couple standard methods.
#define FUNCDEF(func_in)
FUNCDEF sets the name of a function (and plugs it into the callstack).
Provides macros that implement the 'main' program of an application.
#define HOOPLE_MAIN(obj_name, obj_args)
options that should work for most unix and linux apps.
char normalize_char(char a)
#define JUMP_TO_CHAR(to_find, save_them)
void strain_out_html_codes(astring &to_edit)
#define NEXT_STATE_INCREM
bool caseless_equals(char a, char b)
Implements an application lock to ensure only one is running at once.
The guards collection helps in testing preconditions and reporting errors.
const int MEGABYTE
Number of bytes in a megabyte.
bool negative(const type &a)
negative returns true if "a" is less than zero.
A platform independent way to obtain the timestamp of a file.
A logger that sends to the console screen using the standard output device.
A dynamic container class that holds any kind of object via pointers.