1 /*****************************************************************************\
3 * Name : marks_checker *
4 * Author : Chris Koeritz *
8 * Checks on the existence of the links listed in a HOOPLE format link *
9 * database and reports the bad ones. *
11 *******************************************************************************
12 * Copyright (c) 2005-$now By Author. This program is free software; you can *
13 * redistribute it and/or modify it under the terms of the GNU General Public *
14 * License as published by the Free Software Foundation; either version 2 of *
15 * the License or (at your option) any later version. This is online at: *
16 * http://www.fsf.org/copyleft/gpl.html *
17 * Please send any updates to: fred@gruntose.com *
18 \*****************************************************************************/
20 #include "bookmark_tree.h"
22 #include <application/hoople_main.h>
23 #include <application/command_line.h>
24 #include <basis/astring.h>
25 #include <basis/functions.h>
26 #include <basis/guards.h>
27 #include <basis/mutex.h>
28 #include <filesystem/byte_filer.h>
29 #include <filesystem/filename.h>
30 #include <loggers/file_logger.h>
31 #include <mathematics/chaos.h>
32 #include <processes/ethread.h>
33 #include <processes/thread_cabinet.h>
34 #include <structures/static_memory_gremlin.h>
35 #include <structures/unique_id.h>
36 #include <textual/parser_bits.h>
37 #include <timely/time_control.h>
39 #include <curl/curl.h>
42 #include "../../library/algorithms/sorts.h"
44 using namespace algorithms;
45 using namespace application;
46 using namespace basis;
47 using namespace filesystem;
48 using namespace loggers;
49 using namespace nodes;
50 using namespace mathematics;
51 using namespace processes;
52 using namespace structures;
53 using namespace textual;
54 using namespace timely;
// uncomment to have more debugging noise.

// BASE_LOG: logs a message straight to the program-wide logger, with no
// class or line-number context attached.
#define BASE_LOG(s) program_wide_logger::get().log(astring(s), ALWAYS_PRINT)

// LOG: logs a message prefixed with the line number of the link database
// entry currently being processed; relies on a "_categories" member being
// in scope at the call site.
#define LOG(s) CLASS_EMERGENCY_LOG(program_wide_logger::get(), \
    a_sprintf("line %d: ", _categories._line_number) + s)

const int PAUSEY_SNOOZE = 200;
  // how long we sleep (in milliseconds) if there are too many threads
  // running already.

const int MAXIMUM_THREADS = 14;
  // we allow this many simultaneous web checks at a time.

const int MAXIMUM_READ = 1008;
  // we only download this much of the link.  this avoids huge downloads of
  // page contents that we do not need, since we are only checking that the
  // link answers, not archiving it.

const int MAXIMUM_ATTEMPTS = 2;
  // we'll retry the check if we get an actual error instead of an http error
  // code.  when a name can't be found in the DNS, it sometimes comes back
  // shortly after it was checked.  if we see we can't reach the domain after
  // this many tries, then we give up on the address.

const int TIME_PER_REQUEST_IN_SEC = 60 * 6;
  // limit our requests to this long of a period.  then we will not be
  // stalled forever by uncooperative websites.

const char *FAKE_AGENT_STRING = "FredWeb/7.0 (X11; U; Linux i686; "
    "en-US; rv:1.8.19) Flecko/20081031";
  // we use this as our agent type, since some sites won't treat us fairly
  // if they think we're robots when we're checking their site health.
  // for example (ahem!), the usa today websites.
////////////////////////////////////////////////////////////////////////////

// safe_int_array: a small thread-safe wrapper around basis::int_array.
// every operation below acquires _lock via an auto_synchronizer before
// touching the shared _list, so multiple checker threads can record bad
// line numbers concurrently.

  safe_int_array() : _lock(), _list(0) {}

  // appends "to_add" to the end of the list, under the lock.
  void add(int to_add) {
///BASE_LOG(a_sprintf("adding %d to list", to_add));
    auto_synchronizer l(_lock);

    // reports the current number of stored elements, under the lock.
    auto_synchronizer l(_lock);
    return _list.length();

  // returns a snapshot copy of the list, so callers can iterate or sort it
  // without holding the lock.
  basis::int_array make_copy() {
    auto_synchronizer l(_lock);

  basis::int_array _list;
    // the underlying storage; only touched while _lock is held.

////////////////////////////////////////////////////////////////////////////
class marks_checker : public application_shell
// the application object: reads a HOOPLE-format link database, farms each
// link out to checker threads for health testing, and writes the results
// into separate good-link and bad-link files.

  : application_shell(), _check_redirection(false),
    _max_threads(MAXIMUM_THREADS), _null_file(filename::null_device(), "w")

  DEFINE_CLASS_NAME("marks_checker");

  virtual int execute();
    // main program logic, invoked by the application framework.

  int print_instructions(const filename &program_name);
    // shows command line usage help for this program.

  int test_all_links();
    // goes through the tree of links and tests them all.

  int check_link(const astring &url, astring &error_msg);
    // synchronously checks the "url" for health.  the return value is zero
    // on success or an HTTP error code on failure.

  void write_new_files();
    // writes out the two new files given the info accumulated so far.

  bookmark_tree _categories;  // our tree of categories.
  safe_int_array _bad_lines;  // lines with bad contents.
  thread_cabinet _checkers;  // threads checking on links.
  astring _input_filename;  // we'll store our link database name here.
  astring _output_filename;  // where the list of good links is stored.
  astring _bad_link_filename;  // garbage dump of bad links.
  bool _check_redirection;  // true when redirected sites are reported as bad.
  int _max_threads;  // the most threads we'll allow at once.
  byte_filer _null_file;  // we'll use this for trashing output data.

  static void handle_OS_signal(int sig_id);
    // handles break signals from the user.
159 ////////////////////////////////////////////////////////////////////////////
class checking_thread : public ethread
// worker thread that checks the health of a single link and, on failure,
// records the link's line number in the shared bad-lines list.

  checking_thread(const link_record &link_info, safe_int_array &bad_lines,
          marks_checker &checker)
  : ethread(), _bad_lines(bad_lines), _checker(checker), _info(link_info) {}

  void perform_activity(void *formal(ptr)) {
    // synchronously test our link; zero means healthy.
    int ret = _checker.check_link(_info._url, message);
    // build a multi-line complaint describing the broken link.
    astring complaint = a_sprintf("Bad Link at line %d:", _info._uid)
        += parser_bits::platform_eol_to_chars();
    const astring spacer(' ', 4);
    complaint += spacer + _info._url += parser_bits::platform_eol_to_chars();
    complaint += spacer + _info._description += parser_bits::platform_eol_to_chars();
    complaint += spacer + "error: " += message;
    // sanity check the line number before recording it.
    if ( (_info._uid> 100000) || (_info._uid < 0) ) {
      BASE_LOG(a_sprintf("somehow got bogus line number! %d", _info._uid));
    _bad_lines.add(_info._uid);  // list ours as bad.

  safe_int_array &_bad_lines;  // shared collector of bad line numbers.
  marks_checker &_checker;  // the object that performs the actual check.
// logs a usage synopsis for the program, describing the mandatory -i/-o/-b
// file flags and the optional --no-redirs and --threads flags.  "program_name"
// supplies the executable's name for the message text.
int marks_checker::print_instructions(const filename &program_name)
  a_sprintf to_show("%s:\n\
This program needs three filenames as command line parameters.  The -i flag\n\
is used to specify the input filename.  The -o flag specifies the file where\n\
where the good links will be written.  The -b flag specifies the file where\n\
the bad links are written.  The optional flag --no-redirs can be used to\n\
disallow web-site redirection, which will catch when the site has changed\n\
its location.  Note that redirection is not necessarily an error, but it\n\
instead may just be a link that needs its URL modified.  It is recommended\n\
that you omit this flag in early runs, in order to only locate definitely\n\
dead links.  Then later checking runs can find any sites that were redirected\n\
or being routed to a dead link page which doesn't provide an error code.\n\
The optional flag --threads with a parameter will set the maximum number of\n\
threads that will simultaneously check on links.\n\
The input file is expected to be in the HOOPLE link database format.\n\
The HOOPLE link format is documented here:\n\
http://feistymeow.org/guides/link_database/format_manifesto.txt\n\
", program_name.basename().raw().s(), program_name.basename().raw().s());
  program_wide_logger::get().log(to_show, ALWAYS_PRINT);
218 // this function just eats any data it's handed.
219 size_t data_sink(void *formal(ptr), size_t size, size_t number, void *formal(stream))
220 { return size * number; }
// synchronously fetches "url" with libcurl to see whether the link is still
// alive.  zero is returned for a healthy link; otherwise an error code is
// returned and "error_msg" holds a human-readable explanation.
int marks_checker::check_link(const astring &url, astring &error_msg)
  CURL *cur = curl_easy_init();

  curl_easy_setopt(cur, CURLOPT_URL, url.s());  // set the URL itself.

  curl_easy_setopt(cur, CURLOPT_SSL_VERIFYPEER, 0);
    // don't verify SSL certificates.
  curl_easy_setopt(cur, CURLOPT_MAXFILESIZE, MAXIMUM_READ);
    // limit the download size; causes size errors, which we elide to success.
  curl_easy_setopt(cur, CURLOPT_NOSIGNAL, 1);
    // don't use signals since it interferes with sleep.
  curl_easy_setopt(cur, CURLOPT_TIMEOUT, TIME_PER_REQUEST_IN_SEC);
    // limit time allowed per operation.
  curl_easy_setopt(cur, CURLOPT_AUTOREFERER, true);
    // automatically fill in the referer field when redirected.

  curl_easy_setopt(cur, CURLOPT_WRITEDATA, _null_file.file_handle());
    // set the file handle where we want our downloaded data to go.  since
    // we're just checking the links, this goes right to the trash.
  curl_easy_setopt(cur, CURLOPT_WRITEFUNCTION, data_sink);
    // set the function which will be given all the downloaded data.

  curl_easy_setopt(cur, CURLOPT_USERAGENT, FAKE_AGENT_STRING);
    // fake being a browser here since otherwise we get no respect.

  curl_easy_setopt(cur, CURLOPT_FTPLISTONLY, 1);
    // get only a simple list of files, which allows us to hit ftp sites
    // properly.  if the normal curl mode is used, we get nothing.

  if (_check_redirection) {
    // attempting to quash redirects as being valid.
    curl_easy_setopt(cur, CURLOPT_FOLLOWLOCATION, 1);  // follow redirects.
    curl_easy_setopt(cur, CURLOPT_MAXREDIRS, 0);  // allow zero redirects.

  // retry loop: transient failures (e.g. a DNS hiccup) get another chance,
  // up to MAXIMUM_ATTEMPTS tries total.
  while (tries++ < MAXIMUM_ATTEMPTS) {

    // we do the error message again every time, since it gets shrunk after
    // the web page check and is no longer available where it was in memory.
    error_msg = astring(' ', CURL_ERROR_SIZE + 5);
    curl_easy_setopt(cur, CURLOPT_ERRORBUFFER, error_msg.s());
      // set the error message buffer so we know what happened.

    // try to lookup the web page we've been given.
    to_return = curl_easy_perform(cur);

    error_msg.shrink();  // just use the message without extra spaces.

    // we turn file size errors into non-errors, since we have set a very
    // low file size in order to avoid downloading too much.  we really just
    // want to check links, not download their contents.
    if (to_return == CURLE_FILESIZE_EXCEEDED) to_return = 0;

    // supposedly this is a success, but let's check the result code.
    curl_easy_getinfo(cur, CURLINFO_RESPONSE_CODE, &result);

    error_msg = a_sprintf("received http failure code %d", result);

    break;  // this was a successful result, a zero outcome from perform.

    // pause before the next retry to give transient problems time to clear.
    time_control::sleep_ms(10 * SECOND_ms);  // give it a few more seconds...

  curl_easy_cleanup(cur);
// walks the whole category tree and starts a checking_thread for every link
// found, throttled to at most _max_threads simultaneous checks, then waits
// for all outstanding checker threads to finish before returning.
int marks_checker::test_all_links()
  FUNCDEF("test_all_links");
  // traverse the tree in prefix order.
  tree::iterator itty = _categories.access_root().start(tree::prefix);
  tree *curr = NULL_POINTER;
  while ( (curr = itty.next()) ) {
    inner_mark_tree *nod = dynamic_cast<inner_mark_tree *>(curr);
      non_continuable_error(static_class_name(), func, "failed to cast a tree node");
    // iterate on all the links at this node to check them.
    for (int i = 0; i < nod->_links.elements(); i++) {
      link_record *lin = nod->_links.borrow(i);
      if (!lin->_url) continue;  // not a link.

      // throttle: wait for a free slot while too many checkers are running,
      // reaping any finished threads as we go.
      while (_checkers.threads() > _max_threads) {
        time_control::sleep_ms(PAUSEY_SNOOZE);
        _checkers.clean_debris();

      // hand this link off to a fresh thread managed by the cabinet.
      checking_thread *new_thread = new checking_thread(*lin, _bad_lines,
      unique_int id = _checkers.add_thread(new_thread, true, NULL_POINTER);

  BASE_LOG("... finished iterating on tree.");

  // now wait until all the threads are finished.
  while (_checkers.threads()) {
    time_control::sleep_ms(PAUSEY_SNOOZE);
    _checkers.clean_debris();

  BASE_LOG("... finished waiting for all threads.");
// re-reads the input database line by line and splits it into the output
// file (healthy links) and the badness file (lines whose numbers were
// flagged by the checker threads).
void marks_checker::write_new_files()
  byte_filer input_file(_input_filename, "r");
  byte_filer output_file(_output_filename, "w");
  byte_filer badness_file(_bad_link_filename, "w");

  // snapshot the flagged line numbers and sort them so they can be consumed
  // in ascending order while the file is read sequentially.
  basis::int_array badness = _bad_lines.make_copy();
  shell_sort<int>(badness.access(), badness.length());

  BASE_LOG("bad links are on lines:");
  for (int i = 0; i < badness.length(); i++) {
    bad_list += a_sprintf("%d, ", badness[i]);

  // copy each input line to exactly one of the two output files.
  while (!input_file.eof()) {
    // drop any flagged line numbers that are already behind our position.
    while (badness.length() && (badness[0] < curr_line) ) {
      BASE_LOG(a_sprintf("whacking too low line number: %d", badness[0]));
    input_file.getline(buffer, 2048);
      //TODO: make the 2048 buffer size a named constant.
    if (badness.length() && (badness[0] == curr_line)) {
      // we seem to have found a bad line.
      badness_file.write(buffer);
      badness.zap(0, 0);  // remove the current line number.
      // this is a healthy line.
      output_file.write(buffer);

  badness_file.close();
marks_checker *main_program = NULL_POINTER;
  // global hook that lets the static signal handler reach the running
  // application object; reset to NULL_POINTER once shutdown is handled.
// invoked when the user sends a break (SIGINT): flushes the results gathered
// so far to the output files and disables further break handling.
void marks_checker::handle_OS_signal(int formal(sig_id))
  signal(SIGINT, SIG_IGN);  // turn off that signal for now.
  BASE_LOG("caught break signal... now writing files.");
  if (main_program) main_program->write_new_files();
  BASE_LOG("exiting after handling break.");
  main_program = NULL_POINTER;
// main program logic: parses the command line, reads the link database,
// tests every link, and cleans up the cURL engine afterwards.
int marks_checker::execute()
  main_program = this;  // used by our signal handler.

  command_line cmds(_global_argc, _global_argv);  // process the command line parameters.
  // all three file name flags are mandatory; bail with usage help otherwise.
  if (!cmds.get_value('i', _input_filename, false))
    return print_instructions(cmds.program_name());
  if (!cmds.get_value('o', _output_filename, false))
    return print_instructions(cmds.program_name());
  if (!cmds.get_value('b', _bad_link_filename, false))
    return print_instructions(cmds.program_name());

  // optional flag for checking website redirection.
  if (cmds.get_value("no-redirs", temp, false)) {
    BASE_LOG("Enabling redirection checking: redirected web sites are reported as bad.");
    _check_redirection = true;

  // optional flag for number of threads.
  if (cmds.get_value("threads", threads, false)) {
    _max_threads = threads.convert(0);
    BASE_LOG(a_sprintf("Maximum threads allowed=%d", _max_threads));

  BASE_LOG(astring("input file: ") + _input_filename);
  BASE_LOG(astring("output file: ") + _output_filename);
  BASE_LOG(astring("bad link file: ") + _bad_link_filename);

  //hmmm: check if output file already exists.
  //hmmm: check if bad file already exists.

  LOG("before reading input...");

  int ret = _categories.read_csv_file(_input_filename);
  if (ret) return ret;  // failure during read means we can't do much.

  LOG("after reading input...");

  signal(SIGINT, handle_OS_signal);
    // hook the break signal so we can still do part of the job if they
    // interrupt the run before all links have been checked.

  curl_global_init(CURL_GLOBAL_ALL);  // crank up the cURL engine.

  ret = test_all_links();

  main_program = NULL_POINTER;

  curl_global_cleanup();  // shut down cURL engine again.
////////////////////////////////////////////////////////////////////////////

// application bootstrap macro from application/hoople_main.h; presumably it
// expands to the program's main entry point driving marks_checker::execute().
HOOPLE_MAIN(marks_checker, )