feisty meow concerns codebase 2.140
marks_checker.cpp
Go to the documentation of this file.
1/*****************************************************************************\
2* *
3* Name : marks_checker *
4* Author : Chris Koeritz *
5* *
6* Purpose: *
7* *
8* Checks on the existence of the links listed in a HOOPLE format link *
9* database and reports the bad ones. *
10* *
11*******************************************************************************
12* Copyright (c) 2005-$now By Author. This program is free software; you can *
13* redistribute it and/or modify it under the terms of the GNU General Public *
14* License as published by the Free Software Foundation; either version 2 of *
15* the License or (at your option) any later version. This is online at: *
16* http://www.fsf.org/copyleft/gpl.html *
17* Please send any updates to: fred@gruntose.com *
18\*****************************************************************************/
19
20#include "bookmark_tree.h"
21
22#include <algorithms/sorts.h>
26#include <basis/astring.h>
27#include <basis/functions.h>
28#include <basis/guards.h>
29#include <basis/mutex.h>
31#include <filesystem/filename.h>
32#include <loggers/file_logger.h>
33#include <mathematics/chaos.h>
34#include <processes/ethread.h>
38#include <textual/parser_bits.h>
39#include <timely/time_control.h>
40
41#include <curl/curl.h>
42#include <signal.h>
43#include <stdlib.h>
44
45using namespace algorithms;
46using namespace application;
47using namespace basis;
48using namespace filesystem;
49using namespace loggers;
50using namespace nodes;
51using namespace mathematics;
52using namespace processes;
53using namespace structures;
54using namespace textual;
55using namespace timely;
56
57//#define DEBUG_MARKS
58 // uncomment to have more debugging noise.
59
// BASE_LOG: sends "s" to the program-wide logger with the ALWAYS_PRINT filter.
#undef BASE_LOG
#define BASE_LOG(s) program_wide_logger::get().log(astring(s), ALWAYS_PRINT)

// LOG: like an emergency log entry, but prefixed with the line number that
// the "_categories" member currently reports.  only usable where a
// "_categories" object is in scope.
#undef LOG
#define LOG(s) CLASS_EMERGENCY_LOG(program_wide_logger::get(), \
    a_sprintf("line %d: ", _categories._line_number) + s)
65
const int PAUSEY_SNOOZE = 200;
  // the nap taken while waiting for a checker thread slot to open up
  // (presumably in milliseconds).

const int MAXIMUM_THREADS = 14;
  // default ceiling on the number of web checks running simultaneously.

const int MAXIMUM_READ = 1008;
  // download cap handed to cURL for each link.  we only want to know if the
  // link is alive, so there is no point in pulling down very large pages.

const int MAXIMUM_ATTEMPTS = 2;
  // number of tries per link when cURL reports a real error rather than an
  // http error code.  a name that the DNS cannot find will sometimes resolve
  // shortly afterwards, so we only give up on the address after this many
  // failed tries.

const int TIME_PER_REQUEST_IN_SEC = 60 * 6;
  // upper bound on how long one request may take, so that uncooperative
  // websites cannot stall us forever.

const char *FAKE_AGENT_STRING = "FredWeb/7.0 (X11; U; Linux i686; "
    "en-US; rv:1.8.19) Flecko/20081031";
  // the user agent we claim to be.  some sites won't treat us fairly if they
  // think we're robots when we're checking their site health; for example
  // (ahem!), the usa today websites.
  // TODO: confirm that this masquerade is still necessary.
92
94
95class safe_int_array
96{
97public:
98 safe_int_array() : _lock(), _list(0) {}
99
100 void add(int to_add) {
102 auto_synchronizer l(_lock);
103 _list += to_add;
104 }
105
106 int length() {
107 auto_synchronizer l(_lock);
108 return _list.length();
109 }
110
111 basis::int_array make_copy() {
112 auto_synchronizer l(_lock);
113 return _list;
114 }
115
116private:
117 basis::mutex _lock;
118 basis::int_array _list;
119};
120
122
123class marks_checker : public application_shell
124{
125public:
126 marks_checker()
127 : application_shell(), _check_redirection(false),
128 _max_threads(MAXIMUM_THREADS), _null_file(filename::null_device(), "w")
129 {}
130
131 DEFINE_CLASS_NAME("marks_checker");
132 virtual int execute();
133 int print_instructions(const filename &program_name);
134
135 int test_all_links();
136 // goes through the tree of links and tests them all.
137
138 int check_link(const astring &url, astring &error_msg);
139 // synchronously checks the "url" for health. the return value is zero
140 // on success or an HTTP error code on failure.
141
142 void write_new_files();
143 // writes out the two new files given the info accumulated so far.
144
145private:
146 bookmark_tree _categories; // our tree of categories.
147 safe_int_array _bad_lines; // lines with bad contents.
148 thread_cabinet _checkers; // threads checking on links.
149 astring _input_filename; // we'll store our link database name here.
150 astring _output_filename; // where the list of good links is stored.
151 astring _bad_link_filename; // garbage dump of bad links.
152 bool _check_redirection; // true if redirection is disallowed.
153 int _max_threads; // the most threads we'll allow at once.
154 byte_filer _null_file; // we'll use this for trashing output data.
155
156 static void handle_OS_signal(int sig_id);
157 // handles break signals from the user.
158};
159
161
162class checking_thread : public ethread
163{
164public:
165 checking_thread(const link_record &link_info, safe_int_array &bad_lines,
166 marks_checker &checker)
167 : ethread(), _bad_lines(bad_lines), _checker(checker), _info(link_info) {}
168
169 void perform_activity(void *formal(ptr)) {
170 astring message;
171 int ret = _checker.check_link(_info._url, message);
172 if (ret != 0) {
173 astring complaint = a_sprintf("Bad Link at line %d:", _info._uid)
175 const astring spacer(' ', 4);
176 complaint += spacer + _info._url += parser_bits::platform_eol_to_chars();
177 complaint += spacer + _info._description += parser_bits::platform_eol_to_chars();
178 complaint += spacer + "error: " += message;
179 BASE_LOG(complaint);
180if ( (_info._uid> 100000) || (_info._uid < 0) ) {
181BASE_LOG(a_sprintf("somehow got bogus line number! %d", _info._uid));
182return;
183}
184 _bad_lines.add(_info._uid); // list ours as bad.
185 }
186 }
187
188private:
189 safe_int_array &_bad_lines;
190 marks_checker &_checker;
191 link_record _info;
192};
193
195
196int marks_checker::print_instructions(const filename &program_name)
197{
198 a_sprintf to_show("%s:\n\
199This program needs three filenames as command line parameters. The -i flag\n\
200is used to specify the input filename. The -o flag specifies the file where\n\
201where the good links will be written. The -b flag specifies the file where\n\
202the bad links are written. The optional flag --no-redirs can be used to\n\
203disallow web-site redirection, which will catch when the site has changed\n\
204its location. Note that redirection is not necessarily an error, but it\n\
205instead may just be a link that needs its URL modified. It is recommended\n\
206that you omit this flag in early runs, in order to only locate definitely\n\
207dead links. Then later checking runs can find any sites that were redirected\n\
208or being routed to a dead link page which doesn't provide an error code.\n\
209The optional flag --threads with a parameter will set the maximum number of\n\
210threads that will simultaneously check on links.\n\
211The input file is expected to be in the HOOPLE link database format.\n\
212The HOOPLE link format is documented here:\n\
213 http://feistymeow.org/guides/link_database/format_manifesto.txt\n\
214", program_name.basename().raw().s(), program_name.basename().raw().s());
215 program_wide_logger::get().log(to_show, ALWAYS_PRINT);
216 return 12;
217}
218
219// this function just eats any data it's handed.
220size_t data_sink(void *formal(ptr), size_t size, size_t number, void *formal(stream))
221{ return size * number; }
222
223int marks_checker::check_link(const astring &url, astring &error_msg)
224{
225 int to_return = -1;
226
227 CURL *cur = curl_easy_init();
228
229 curl_easy_setopt(cur, CURLOPT_URL, url.s()); // set the URL itself.
230
231 curl_easy_setopt(cur, CURLOPT_SSL_VERIFYPEER, 0);
232 // don't verify SSL certificates.
233 curl_easy_setopt(cur, CURLOPT_MAXFILESIZE, MAXIMUM_READ);
234 // limit the download size; causes size errors, which we elide to success.
235 curl_easy_setopt(cur, CURLOPT_NOSIGNAL, 1);
236 // don't use signals since it interferes with sleep.
237 curl_easy_setopt(cur, CURLOPT_TIMEOUT, TIME_PER_REQUEST_IN_SEC);
238 // limit time allowed per operation.
239 curl_easy_setopt(cur, CURLOPT_AUTOREFERER, true);
240 // automatically fill in the referer field when redirected.
241
242 curl_easy_setopt(cur, CURLOPT_WRITEDATA, _null_file.file_handle());
243 // set the file handle where we want our downloaded data to go. since
244 // we're just checking the links, this goes right to the trash.
245 curl_easy_setopt(cur, CURLOPT_WRITEFUNCTION, data_sink);
246 // set the function which will be given all the downloaded data.
247
248 curl_easy_setopt(cur, CURLOPT_USERAGENT, FAKE_AGENT_STRING);
249 // fake being a browser here since otherwise we get no respect.
250
251 curl_easy_setopt(cur, CURLOPT_FTPLISTONLY, 1);
252 // get only a simple list of files, which allows us to hit ftp sites
253 // properly. if the normal curl mode is used, we get nothing.
254
255 if (_check_redirection) {
256 // attempting to quash redirects as being valid.
257 curl_easy_setopt(cur, CURLOPT_FOLLOWLOCATION, 1); // follow redirects.
258 curl_easy_setopt(cur, CURLOPT_MAXREDIRS, 0); // allow zero redirects.
259 }
260
261 int tries = 0;
262 while (tries++ < MAXIMUM_ATTEMPTS) {
263
264 // we do the error message again every time, since it gets shrunk after
265 // the web page check and is no longer available where it was in memory.
266 error_msg = astring(' ', CURL_ERROR_SIZE + 5);
267 curl_easy_setopt(cur, CURLOPT_ERRORBUFFER, error_msg.s());
268
269 // set the error message buffer so we know what happened.
270
271 // try to lookup the web page we've been given.
272 to_return = curl_easy_perform(cur);
273
274 error_msg.shrink(); // just use the message without extra spaces.
275
276 // we turn file size errors into non-errors, since we have set a very
277 // low file size in order to avoid downloading too much. we really just
278 // want to check links, not download their contents.
279 if (to_return == CURLE_FILESIZE_EXCEEDED) to_return = 0;
280
281 if (!to_return) {
282 // supposedly this is a success, but let's check the result code.
283 long result;
284 curl_easy_getinfo(cur, CURLINFO_RESPONSE_CODE, &result);
285 if (result >= 400) {
286 error_msg = a_sprintf("received http failure code %d", result);
287 to_return = -1;
288 }
289 break; // this was a successful result, a zero outcome from perform.
290 }
291
292 time_control::sleep_ms(10 * SECOND_ms); // give it a few more seconds...
293 }
294
295 curl_easy_cleanup(cur);
296
297 return to_return;
298}
299
300int marks_checker::test_all_links()
301{
302 FUNCDEF("test_all_links");
303 // traverse the tree in prefix order.
304 tree::iterator itty = _categories.access_root().start(tree::prefix);
305 tree *curr = NULL_POINTER;
306 while ( (curr = itty.next()) ) {
307 inner_mark_tree *nod = dynamic_cast<inner_mark_tree *>(curr);
308 if (!nod)
309 non_continuable_error(static_class_name(), func, "failed to cast a tree node");
310 // iterate on all the links at this node to check them.
311 for (int i = 0; i < nod->_links.elements(); i++) {
312 link_record *lin = nod->_links.borrow(i);
313 if (!lin->_url) continue; // not a link.
314
315 while (_checkers.threads() > _max_threads) {
317 _checkers.clean_debris();
318 }
319
320 checking_thread *new_thread = new checking_thread(*lin, _bad_lines,
321 *this);
322 unique_int id = _checkers.add_thread(new_thread, true, NULL_POINTER);
323 }
324 }
325
326BASE_LOG("... finished iterating on tree.");
327
328 // now wait until all the threads are finished.
329 while (_checkers.threads()) {
331 _checkers.clean_debris();
332 }
333
334BASE_LOG("... finished waiting for all threads.");
335
336 return 0;
337}
338
339void marks_checker::write_new_files()
340{
341 byte_filer input_file(_input_filename, "r");
342 byte_filer output_file(_output_filename, "w");
343 byte_filer badness_file(_bad_link_filename, "w");
344
345 basis::int_array badness = _bad_lines.make_copy();
346 shell_sort<int>(badness.access(), badness.length());
347
348 BASE_LOG("bad links are on lines:");
349 astring bad_list;
350 for (int i = 0; i < badness.length(); i++) {
351 bad_list += a_sprintf("%d, ", badness[i]);
352 }
353 BASE_LOG(bad_list);
354
355 astring buffer;
356 int curr_line = 0;
357 while (!input_file.eof()) {
358 curr_line++;
359 while (badness.length() && (badness[0] < curr_line) ) {
360 BASE_LOG(a_sprintf("whacking too low line number: %d", badness[0]));
361 badness.zap(0, 0);
362 }
363 input_file.getline(buffer, 2048);
364//make that a constant.
365 if (badness.length() && (badness[0] == curr_line)) {
366 // we seem to have found a bad line.
367 badness_file.write(buffer);
368 badness.zap(0, 0); // remove the current line number.
369 } else {
370 // this is a healthy line.
371 output_file.write(buffer);
372 }
373
374 }
375 input_file.close();
376 output_file.close();
377 badness_file.close();
378}
379
380marks_checker *main_program = NULL_POINTER;
381
382void marks_checker::handle_OS_signal(int formal(sig_id))
383{
384 signal(SIGINT, SIG_IGN); // turn off that signal for now.
385 BASE_LOG("caught break signal... now writing files.");
386 if (main_program) main_program->write_new_files();
387 BASE_LOG("exiting after handling break.");
389 exit(0);
390}
391
392int marks_checker::execute()
393{
394 FUNCDEF("execute");
396
397 main_program = this; // used by our signal handler.
398
399 command_line cmds(_global_argc, _global_argv); // process the command line parameters.
400 if (!cmds.get_value('i', _input_filename, false))
401 return print_instructions(cmds.program_name());
402 if (!cmds.get_value('o', _output_filename, false))
403 return print_instructions(cmds.program_name());
404 if (!cmds.get_value('b', _bad_link_filename, false))
405 return print_instructions(cmds.program_name());
406
408
409 // optional flag for checking website redirection.
410 if (cmds.get_value("no-redirs", temp, false)) {
411 BASE_LOG("Enabling redirection checking: redirected web sites are reported as bad.");
412 _check_redirection = true;
413 }
414 // optional flag for number of threads.
415 astring threads;
416 if (cmds.get_value("threads", threads, false)) {
417 _max_threads = threads.convert(0);
418 BASE_LOG(a_sprintf("Maximum threads allowed=%d", _max_threads));
419 }
420
421 BASE_LOG(astring("input file: ") + _input_filename);
422 BASE_LOG(astring("output file: ") + _output_filename);
423 BASE_LOG(astring("bad link file: ") + _bad_link_filename);
424
425//hmmm: check if output file already exists.
426//hmmm: check if bad file already exists.
427
428LOG("before reading input...");
429
430 int ret = _categories.read_csv_file(_input_filename);
431 if (ret) return ret; // failure during read means we can't do much.
432
433LOG("after reading input...");
434
435 signal(SIGINT, handle_OS_signal);
436 // hook the break signal so we can still do part of the job if they
437 // interrupt us.
438
439 curl_global_init(CURL_GLOBAL_ALL); // crank up the cURL engine.
440
441 ret = test_all_links();
442
443 write_new_files();
445
446 curl_global_cleanup(); // shut down cURL engine again.
447
448 return 0;
449}
450
452
453HOOPLE_MAIN(marks_checker, )
454
#define BASE_LOG(s)
int print_instructions(bool good, const astring &program_name)
Definition checker.cpp:45
The application_shell is a base object for console programs.
virtual int execute()=0
< retrieves the command line from the /proc hierarchy on linux.
a_sprintf is a specialization of astring that provides printf style support.
Definition astring.h:440
contents * access()
A non-constant access of the underlying C-array. BE REALLY CAREFUL.
Definition array.h:175
int length() const
Returns the current reported length of the allocated C array.
Definition array.h:115
outcome zap(int start, int end)
Deletes from "this" the objects inclusively between "start" and "end".
Definition array.h:769
Provides a dynamically resizable ASCII character string.
Definition astring.h:35
const char * s() const
synonym for observe. the 's' stands for "string", if that helps.
Definition astring.h:113
int convert(int default_value) const
Converts the string into a corresponding integer.
Definition astring.cpp:760
void shrink()
changes all occurrences of "to_replace" into "new_string".
Definition astring.cpp:168
auto_synchronizer simplifies concurrent code by automatically unlocking.
Definition mutex.h:113
virtual outcome log(const base_string &info, int filter)=0
writes the information in "info" to the logger using the "filter".
A simple object that wraps a templated array of ints.
Definition array.h:275
Provides file managment services using the standard I/O support.
Definition byte_filer.h:32
Provides operations commonly needed on file names.
Definition filename.h:64
const basis::astring & raw() const
returns the astring that we're holding onto for the path.
Definition filename.cpp:97
filename basename() const
returns the base of the filename; no directory.
Definition filename.cpp:385
listo_links _links
static loggers::standard_log_base & get()
Provided by the startup code within each application for logging.
tree * next()
Returns a pointer to the next tree in the direction of traversal.
Definition tree.cpp:257
A dynamically linked tree with an arbitrary number of branches.
Definition tree.h:40
@ prefix
Definition tree.h:94
Provides a platform-independent object for adding threads to a program.
Definition ethread.h:36
virtual void perform_activity(void *thread_data)=0
< invoked just after after start(), when the OS thread is created.
Manages a collection of threads.
int elements() const
the maximum number of elements currently allowed in this amorph.
Definition amorph.h:66
contents * borrow(int field)
Returns a pointer to the information at the index "field".
Definition amorph.h:448
A unique identifier based on integers.
Definition unique_id.h:97
static const char * platform_eol_to_chars()
provides the characters that make up this platform's line ending.
static void sleep_ms(basis::un_int msec)
a system independent name for a forced snooze measured in milliseconds.
#define SETUP_COMBO_LOGGER
a macro that retasks the program-wide logger as a combo_logger.
#define non_continuable_error(c, f, i)
an extra piece of information used, if available, in bounds_halt below.
#define formal(parameter)
This macro just eats what it's passed; it marks unused formal parameters.
Definition definitions.h:48
#define NULL_POINTER
The value representing a pointer to nothing.
Definition definitions.h:32
#define DEFINE_CLASS_NAME(objname)
Defines the name of a class by providing a couple standard methods.
Definition enhance_cpp.h:42
#define FUNCDEF(func_in)
FUNCDEF sets the name of a function (and plugs it into the callstack).
Definition enhance_cpp.h:54
Provides macros that implement the 'main' program of an application.
#define HOOPLE_MAIN(obj_name, obj_args)
options that should work for most unix and linux apps.
Definition hoople_main.h:61
const int MAXIMUM_READ
size_t data_sink(void *formal(ptr), size_t size, size_t number, void *formal(stream))
#define LOG(s)
const int MAXIMUM_ATTEMPTS
#define BASE_LOG(s)
const char * FAKE_AGENT_STRING
const int PAUSEY_SNOOZE
marks_checker * main_program
const int TIME_PER_REQUEST_IN_SEC
const int MAXIMUM_THREADS
Implements an application lock to ensure only one is running at once.
char ** _global_argv
The guards collection helps in testing preconditions and reporting errors.
Definition array.h:30
const int SECOND_ms
Number of milliseconds in a second.
A platform independent way to obtain the timestamp of a file.
A logger that sends to the console screen using the standard output device.
An extension to floating point primitives providing approximate equality.
Definition averager.h:21
A dynamic container class that holds any kind of object via pointers.
Definition amorph.h:55
#include <time.h>
#define static_class_name()
Aids in achievement of platform independence.