feisty meow concerns codebase  2.140
marks_checker.cpp
Go to the documentation of this file.
1 /*****************************************************************************\
2 * *
3 * Name : marks_checker *
4 * Author : Chris Koeritz *
5 * *
6 * Purpose: *
7 * *
8 * Checks on the existence of the links listed in a HOOPLE format link *
9 * database and reports the bad ones. *
10 * *
11 *******************************************************************************
12 * Copyright (c) 2005-$now By Author. This program is free software; you can *
13 * redistribute it and/or modify it under the terms of the GNU General Public *
14 * License as published by the Free Software Foundation; either version 2 of *
15 * the License or (at your option) any later version. This is online at: *
16 * http://www.fsf.org/copyleft/gpl.html *
17 * Please send any updates to: fred@gruntose.com *
18 \*****************************************************************************/
19 
20 #include "bookmark_tree.h"
21 
22 #include <algorithms/sorts.h>
26 #include <basis/astring.h>
27 #include <basis/functions.h>
28 #include <basis/guards.h>
29 #include <basis/mutex.h>
30 #include <filesystem/byte_filer.h>
31 #include <filesystem/filename.h>
32 #include <loggers/file_logger.h>
33 #include <mathematics/chaos.h>
34 #include <processes/ethread.h>
37 #include <structures/unique_id.h>
38 #include <textual/parser_bits.h>
39 #include <timely/time_control.h>
40 
41 #include <curl/curl.h>
42 #include <signal.h>
43 #include <stdlib.h>
44 
45 using namespace algorithms;
46 using namespace application;
47 using namespace basis;
48 using namespace filesystem;
49 using namespace loggers;
50 using namespace nodes;
51 using namespace mathematics;
52 using namespace processes;
53 using namespace structures;
54 using namespace textual;
55 using namespace timely;
56 
57 //#define DEBUG_MARKS
58  // uncomment to have more debugging noise.
59 
// BASE_LOG: sends "s" to the program-wide logger using the ALWAYS_PRINT
// filter. any earlier definition of the macro is discarded first.
#undef BASE_LOG
#define BASE_LOG(s) program_wide_logger::get().log(astring(s), ALWAYS_PRINT)

// LOG: like BASE_LOG, but goes through CLASS_EMERGENCY_LOG and prefixes the
// text with the bookmark tree's current line number. it refers to
// "_categories", so it is only usable inside marks_checker member functions.
#undef LOG
#define LOG(s) CLASS_EMERGENCY_LOG(program_wide_logger::get(), \
    a_sprintf("line %d: ", _categories._line_number) + s)

// milliseconds to nap between polls of the thread cabinet, both while too
// many checks are in flight and while waiting for the last ones to finish.
const int PAUSEY_SNOOZE = 200;

// default ceiling on the number of simultaneous web checks.
const int MAXIMUM_THREADS = 14;

// download size limit handed to cURL for each link. keeping it tiny stops us
// from pulling down very large pages just to see whether they exist.
const int MAXIMUM_READ = 1008;

// number of tries a link gets when the transfer itself fails (as opposed to
// the server answering with an http error code). a name that cannot be found
// in the DNS sometimes comes back shortly after it was checked; once this
// many tries have failed we give up on the address.
const int MAXIMUM_ATTEMPTS = 2;

// upper bound, in seconds, on a single request, so that uncooperative
// websites cannot stall us forever.
const int TIME_PER_REQUEST_IN_SEC = 60 * 6;

// the user agent we present. some sites won't treat us fairly if they think
// a robot is checking their health (for example, ahem, the usa today
// websites).
// NOTE(review): the original author questioned whether this is still true.
const char *FAKE_AGENT_STRING = "FredWeb/7.0 (X11; U; Linux i686; "
    "en-US; rv:1.8.19) Flecko/20081031";
92 
94 
95 class safe_int_array
96 {
97 public:
98  safe_int_array() : _lock(), _list(0) {}
99 
100  void add(int to_add) {
102  auto_synchronizer l(_lock);
103  _list += to_add;
104  }
105 
106  int length() {
107  auto_synchronizer l(_lock);
108  return _list.length();
109  }
110 
111  basis::int_array make_copy() {
112  auto_synchronizer l(_lock);
113  return _list;
114  }
115 
116 private:
117  basis::mutex _lock;
118  basis::int_array _list;
119 };
120 
122 
123 class marks_checker : public application_shell
124 {
125 public:
126  marks_checker()
127  : application_shell(), _check_redirection(false),
128  _max_threads(MAXIMUM_THREADS), _null_file(filename::null_device(), "w")
129  {}
130 
131  DEFINE_CLASS_NAME("marks_checker");
132  virtual int execute();
133  int print_instructions(const filename &program_name);
134 
135  int test_all_links();
136  // goes through the tree of links and tests them all.
137 
138  int check_link(const astring &url, astring &error_msg);
139  // synchronously checks the "url" for health. the return value is zero
140  // on success or an HTTP error code on failure.
141 
142  void write_new_files();
143  // writes out the two new files given the info accumulated so far.
144 
145 private:
146  bookmark_tree _categories; // our tree of categories.
147  safe_int_array _bad_lines; // lines with bad contents.
148  thread_cabinet _checkers; // threads checking on links.
149  astring _input_filename; // we'll store our link database name here.
150  astring _output_filename; // where the list of good links is stored.
151  astring _bad_link_filename; // garbage dump of bad links.
152  bool _check_redirection; // true if redirection is disallowed.
153  int _max_threads; // the most threads we'll allow at once.
154  byte_filer _null_file; // we'll use this for trashing output data.
155 
156  static void handle_OS_signal(int sig_id);
157  // handles break signals from the user.
158 };
159 
161 
162 class checking_thread : public ethread
163 {
164 public:
165  checking_thread(const link_record &link_info, safe_int_array &bad_lines,
166  marks_checker &checker)
167  : ethread(), _bad_lines(bad_lines), _checker(checker), _info(link_info) {}
168 
169  void perform_activity(void *formal(ptr)) {
170  astring message;
171  int ret = _checker.check_link(_info._url, message);
172  if (ret != 0) {
173  astring complaint = a_sprintf("Bad Link at line %d:", _info._uid)
174  += parser_bits::platform_eol_to_chars();
175  const astring spacer(' ', 4);
176  complaint += spacer + _info._url += parser_bits::platform_eol_to_chars();
177  complaint += spacer + _info._description += parser_bits::platform_eol_to_chars();
178  complaint += spacer + "error: " += message;
179  BASE_LOG(complaint);
180 if ( (_info._uid> 100000) || (_info._uid < 0) ) {
181 BASE_LOG(a_sprintf("somehow got bogus line number! %d", _info._uid));
182 return;
183 }
184  _bad_lines.add(_info._uid); // list ours as bad.
185  }
186  }
187 
188 private:
189  safe_int_array &_bad_lines;
190  marks_checker &_checker;
191  link_record _info;
192 };
193 
195 
196 int marks_checker::print_instructions(const filename &program_name)
197 {
198  a_sprintf to_show("%s:\n\
199 This program needs three filenames as command line parameters. The -i flag\n\
200 is used to specify the input filename. The -o flag specifies the file where\n\
201 where the good links will be written. The -b flag specifies the file where\n\
202 the bad links are written. The optional flag --no-redirs can be used to\n\
203 disallow web-site redirection, which will catch when the site has changed\n\
204 its location. Note that redirection is not necessarily an error, but it\n\
205 instead may just be a link that needs its URL modified. It is recommended\n\
206 that you omit this flag in early runs, in order to only locate definitely\n\
207 dead links. Then later checking runs can find any sites that were redirected\n\
208 or being routed to a dead link page which doesn't provide an error code.\n\
209 The optional flag --threads with a parameter will set the maximum number of\n\
210 threads that will simultaneously check on links.\n\
211 The input file is expected to be in the HOOPLE link database format.\n\
212 The HOOPLE link format is documented here:\n\
213  http://feistymeow.org/guides/link_database/format_manifesto.txt\n\
214 ", program_name.basename().raw().s(), program_name.basename().raw().s());
215  program_wide_logger::get().log(to_show, ALWAYS_PRINT);
216  return 12;
217 }
218 
219 // this function just eats any data it's handed.
220 size_t data_sink(void *formal(ptr), size_t size, size_t number, void *formal(stream))
221 { return size * number; }
222 
223 int marks_checker::check_link(const astring &url, astring &error_msg)
224 {
225  int to_return = -1;
226 
227  CURL *cur = curl_easy_init();
228 
229  curl_easy_setopt(cur, CURLOPT_URL, url.s()); // set the URL itself.
230 
231  curl_easy_setopt(cur, CURLOPT_SSL_VERIFYPEER, 0);
232  // don't verify SSL certificates.
233  curl_easy_setopt(cur, CURLOPT_MAXFILESIZE, MAXIMUM_READ);
234  // limit the download size; causes size errors, which we elide to success.
235  curl_easy_setopt(cur, CURLOPT_NOSIGNAL, 1);
236  // don't use signals since it interferes with sleep.
237  curl_easy_setopt(cur, CURLOPT_TIMEOUT, TIME_PER_REQUEST_IN_SEC);
238  // limit time allowed per operation.
239  curl_easy_setopt(cur, CURLOPT_AUTOREFERER, true);
240  // automatically fill in the referer field when redirected.
241 
242  curl_easy_setopt(cur, CURLOPT_WRITEDATA, _null_file.file_handle());
243  // set the file handle where we want our downloaded data to go. since
244  // we're just checking the links, this goes right to the trash.
245  curl_easy_setopt(cur, CURLOPT_WRITEFUNCTION, data_sink);
246  // set the function which will be given all the downloaded data.
247 
248  curl_easy_setopt(cur, CURLOPT_USERAGENT, FAKE_AGENT_STRING);
249  // fake being a browser here since otherwise we get no respect.
250 
251  curl_easy_setopt(cur, CURLOPT_FTPLISTONLY, 1);
252  // get only a simple list of files, which allows us to hit ftp sites
253  // properly. if the normal curl mode is used, we get nothing.
254 
255  if (_check_redirection) {
256  // attempting to quash redirects as being valid.
257  curl_easy_setopt(cur, CURLOPT_FOLLOWLOCATION, 1); // follow redirects.
258  curl_easy_setopt(cur, CURLOPT_MAXREDIRS, 0); // allow zero redirects.
259  }
260 
261  int tries = 0;
262  while (tries++ < MAXIMUM_ATTEMPTS) {
263 
264  // we do the error message again every time, since it gets shrunk after
265  // the web page check and is no longer available where it was in memory.
266  error_msg = astring(' ', CURL_ERROR_SIZE + 5);
267  curl_easy_setopt(cur, CURLOPT_ERRORBUFFER, error_msg.s());
268 
269  // set the error message buffer so we know what happened.
270 
271  // try to lookup the web page we've been given.
272  to_return = curl_easy_perform(cur);
273 
274  error_msg.shrink(); // just use the message without extra spaces.
275 
276  // we turn file size errors into non-errors, since we have set a very
277  // low file size in order to avoid downloading too much. we really just
278  // want to check links, not download their contents.
279  if (to_return == CURLE_FILESIZE_EXCEEDED) to_return = 0;
280 
281  if (!to_return) {
282  // supposedly this is a success, but let's check the result code.
283  long result;
284  curl_easy_getinfo(cur, CURLINFO_RESPONSE_CODE, &result);
285  if (result >= 400) {
286  error_msg = a_sprintf("received http failure code %d", result);
287  to_return = -1;
288  }
289  break; // this was a successful result, a zero outcome from perform.
290  }
291 
292  time_control::sleep_ms(10 * SECOND_ms); // give it a few more seconds...
293  }
294 
295  curl_easy_cleanup(cur);
296 
297  return to_return;
298 }
299 
300 int marks_checker::test_all_links()
301 {
302  FUNCDEF("test_all_links");
303  // traverse the tree in prefix order.
304  tree::iterator itty = _categories.access_root().start(tree::prefix);
305  tree *curr = NULL_POINTER;
306  while ( (curr = itty.next()) ) {
307  inner_mark_tree *nod = dynamic_cast<inner_mark_tree *>(curr);
308  if (!nod)
309  non_continuable_error(static_class_name(), func, "failed to cast a tree node");
310  // iterate on all the links at this node to check them.
311  for (int i = 0; i < nod->_links.elements(); i++) {
312  link_record *lin = nod->_links.borrow(i);
313  if (!lin->_url) continue; // not a link.
314 
315  while (_checkers.threads() > _max_threads) {
316  time_control::sleep_ms(PAUSEY_SNOOZE);
317  _checkers.clean_debris();
318  }
319 
320  checking_thread *new_thread = new checking_thread(*lin, _bad_lines,
321  *this);
322  unique_int id = _checkers.add_thread(new_thread, true, NULL_POINTER);
323  }
324  }
325 
326 BASE_LOG("... finished iterating on tree.");
327 
328  // now wait until all the threads are finished.
329  while (_checkers.threads()) {
330  time_control::sleep_ms(PAUSEY_SNOOZE);
331  _checkers.clean_debris();
332  }
333 
334 BASE_LOG("... finished waiting for all threads.");
335 
336  return 0;
337 }
338 
339 void marks_checker::write_new_files()
340 {
341  byte_filer input_file(_input_filename, "r");
342  byte_filer output_file(_output_filename, "w");
343  byte_filer badness_file(_bad_link_filename, "w");
344 
345  basis::int_array badness = _bad_lines.make_copy();
346  shell_sort<int>(badness.access(), badness.length());
347 
348  BASE_LOG("bad links are on lines:");
349  astring bad_list;
350  for (int i = 0; i < badness.length(); i++) {
351  bad_list += a_sprintf("%d, ", badness[i]);
352  }
353  BASE_LOG(bad_list);
354 
355  astring buffer;
356  int curr_line = 0;
357  while (!input_file.eof()) {
358  curr_line++;
359  while (badness.length() && (badness[0] < curr_line) ) {
360  BASE_LOG(a_sprintf("whacking too low line number: %d", badness[0]));
361  badness.zap(0, 0);
362  }
363  input_file.getline(buffer, 2048);
364 //make that a constant.
365  if (badness.length() && (badness[0] == curr_line)) {
366  // we seem to have found a bad line.
367  badness_file.write(buffer);
368  badness.zap(0, 0); // remove the current line number.
369  } else {
370  // this is a healthy line.
371  output_file.write(buffer);
372  }
373 
374  }
375  input_file.close();
376  output_file.close();
377  badness_file.close();
378 }
379 
380 marks_checker *main_program = NULL_POINTER;
381 
382 void marks_checker::handle_OS_signal(int formal(sig_id))
383 {
384  signal(SIGINT, SIG_IGN); // turn off that signal for now.
385  BASE_LOG("caught break signal... now writing files.");
386  if (main_program) main_program->write_new_files();
387  BASE_LOG("exiting after handling break.");
389  exit(0);
390 }
391 
392 int marks_checker::execute()
393 {
394  FUNCDEF("execute");
396 
397  main_program = this; // used by our signal handler.
398 
399  command_line cmds(_global_argc, _global_argv); // process the command line parameters.
400  if (!cmds.get_value('i', _input_filename, false))
401  return print_instructions(cmds.program_name());
402  if (!cmds.get_value('o', _output_filename, false))
403  return print_instructions(cmds.program_name());
404  if (!cmds.get_value('b', _bad_link_filename, false))
405  return print_instructions(cmds.program_name());
406 
407  astring temp;
408 
409  // optional flag for checking website redirection.
410  if (cmds.get_value("no-redirs", temp, false)) {
411  BASE_LOG("Enabling redirection checking: redirected web sites are reported as bad.");
412  _check_redirection = true;
413  }
414  // optional flag for number of threads.
415  astring threads;
416  if (cmds.get_value("threads", threads, false)) {
417  _max_threads = threads.convert(0);
418  BASE_LOG(a_sprintf("Maximum threads allowed=%d", _max_threads));
419  }
420 
421  BASE_LOG(astring("input file: ") + _input_filename);
422  BASE_LOG(astring("output file: ") + _output_filename);
423  BASE_LOG(astring("bad link file: ") + _bad_link_filename);
424 
425 //hmmm: check if output file already exists.
426 //hmmm: check if bad file already exists.
427 
428 LOG("before reading input...");
429 
430  int ret = _categories.read_csv_file(_input_filename);
431  if (ret) return ret; // failure during read means we can't do much.
432 
433 LOG("after reading input...");
434 
435  signal(SIGINT, handle_OS_signal);
436  // hook the break signal so we can still do part of the job if they
437  // interrupt us.
438 
439  curl_global_init(CURL_GLOBAL_ALL); // crank up the cURL engine.
440 
441  ret = test_all_links();
442 
443  write_new_files();
445 
446  curl_global_cleanup(); // shut down cURL engine again.
447 
448  return 0;
449 }
450 
452 
453 HOOPLE_MAIN(marks_checker, )
454 
int print_instructions(bool good, const astring &program_name)
Definition: checker.cpp:45
The application_shell is a base object for console programs.
a_sprintf is a specialization of astring that provides printf style support.
Definition: astring.h:440
contents * access()
A non-constant access of the underlying C-array. BE REALLY CAREFUL.
Definition: array.h:175
int length() const
Returns the current reported length of the allocated C array.
Definition: array.h:115
outcome zap(int start, int end)
Deletes from "this" the objects inclusively between "start" and "end".
Definition: array.h:769
Provides a dynamically resizable ASCII character string.
Definition: astring.h:35
const char * s() const
synonym for observe. the 's' stands for "string", if that helps.
Definition: astring.h:113
int convert(int default_value) const
Converts the string into a corresponding integer.
Definition: astring.cpp:757
void shrink()
changes all occurrences of "to_replace" into "new_string".
Definition: astring.cpp:168
auto_synchronizer simplifies concurrent code by automatically unlocking.
Definition: mutex.h:113
A simple object that wraps a templated array of ints.
Definition: array.h:275
Provides file management services using the standard I/O support.
Definition: byte_filer.h:32
Provides operations commonly needed on file names.
Definition: filename.h:64
const basis::astring & raw() const
returns the astring that we're holding onto for the path.
Definition: filename.cpp:97
filename basename() const
returns the base of the filename; no directory.
Definition: filename.cpp:385
listo_links _links
tree * next()
Returns a pointer to the next tree in the direction of traversal.
Definition: tree.cpp:257
A dynamically linked tree with an arbitrary number of branches.
Definition: tree.h:40
Provides a platform-independent object for adding threads to a program.
Definition: ethread.h:36
Manages a collection of threads.
int elements() const
the maximum number of elements currently allowed in this amorph.
Definition: amorph.h:66
contents * borrow(int field)
Returns a pointer to the information at the index "field".
Definition: amorph.h:448
A unique identifier based on integers.
Definition: unique_id.h:97
#define SETUP_COMBO_LOGGER
a macro that retasks the program-wide logger as a combo_logger.
Definition: combo_logger.h:49
#define non_continuable_error(c, f, i)
an extra piece of information used, if available, in bounds_halt below.
#define formal(parameter)
This macro just eats what it's passed; it marks unused formal parameters.
Definition: definitions.h:48
#define NULL_POINTER
The value representing a pointer to nothing.
Definition: definitions.h:32
#define DEFINE_CLASS_NAME(objname)
Defines the name of a class by providing a couple standard methods.
Definition: enhance_cpp.h:45
#define FUNCDEF(func_in)
FUNCDEF sets the name of a function (and plugs it into the callstack).
Definition: enhance_cpp.h:57
Provides macros that implement the 'main' program of an application.
#define HOOPLE_MAIN(obj_name, obj_args)
options that should work for most unix and linux apps.
Definition: hoople_main.h:61
const int MAXIMUM_READ
size_t data_sink(void *formal(ptr), size_t size, size_t number, void *formal(stream))
#define LOG(s)
const int MAXIMUM_ATTEMPTS
#define BASE_LOG(s)
const char * FAKE_AGENT_STRING
const int PAUSEY_SNOOZE
marks_checker * main_program
const int TIME_PER_REQUEST_IN_SEC
const int MAXIMUM_THREADS
Implements an application lock to ensure only one is running at once.
char ** _global_argv
The guards collection helps in testing preconditions and reporting errors.
Definition: array.h:30
const int SECOND_ms
Number of milliseconds in a second.
Definition: definitions.h:120
A platform independent way to obtain the timestamp of a file.
Definition: byte_filer.cpp:37
A logger that sends to the console screen using the standard output device.
An extension to floating point primitives providing approximate equality.
Definition: averager.h:21
A dynamic container class that holds any kind of object via pointers.
Definition: amorph.h:55
#include <time.h>
Definition: earth_time.cpp:37
#define static_class_name()
Aids in achievement of platform independence.