61#define BASE_LOG(s) program_wide_logger::get().log(astring(s), ALWAYS_PRINT)
63#define LOG(s) CLASS_EMERGENCY_LOG(program_wide_logger::get(), \
64 a_sprintf("line %d: ", _categories._line_number) + s)
87 "en-US; rv:1.8.19) Flecko/20081031";
98 safe_int_array() : _lock(), _list(0) {}
100 void add(
int to_add) {
108 return _list.length();
135 int test_all_links();
142 void write_new_files();
147 safe_int_array _bad_lines;
152 bool _check_redirection;
156 static void handle_OS_signal(
int sig_id);
162class checking_thread :
public ethread
165 checking_thread(
const link_record &link_info, safe_int_array &bad_lines,
166 marks_checker &checker)
167 :
ethread(), _bad_lines(bad_lines), _checker(checker), _info(link_info) {}
171 int ret = _checker.check_link(_info._url, message);
178 complaint += spacer +
"error: " += message;
180if ( (_info._uid> 100000) || (_info._uid < 0) ) {
184 _bad_lines.add(_info._uid);
189 safe_int_array &_bad_lines;
190 marks_checker &_checker;
196int marks_checker::print_instructions(
const filename &program_name)
199This program needs three filenames as command line parameters. The -i flag\n\
200is used to specify the input filename. The -o flag specifies the file where\n\
201where the good links will be written. The -b flag specifies the file where\n\
202the bad links are written. The optional flag --no-redirs can be used to\n\
203disallow web-site redirection, which will catch when the site has changed\n\
204its location. Note that redirection is not necessarily an error, but it\n\
205instead may just be a link that needs its URL modified. It is recommended\n\
206that you omit this flag in early runs, in order to only locate definitely\n\
207dead links. Then later checking runs can find any sites that were redirected\n\
208or being routed to a dead link page which doesn't provide an error code.\n\
209The optional flag --threads with a parameter will set the maximum number of\n\
210threads that will simultaneously check on links.\n\
211The input file is expected to be in the HOOPLE link database format.\n\
212The HOOPLE link format is documented here:\n\
213 http://feistymeow.org/guides/link_database/format_manifesto.txt\n\
221{
return size * number; }
223int marks_checker::check_link(
const astring &url,
astring &error_msg)
227 CURL *cur = curl_easy_init();
229 curl_easy_setopt(cur, CURLOPT_URL, url.
s());
231 curl_easy_setopt(cur, CURLOPT_SSL_VERIFYPEER, 0);
233 curl_easy_setopt(cur, CURLOPT_MAXFILESIZE,
MAXIMUM_READ);
235 curl_easy_setopt(cur, CURLOPT_NOSIGNAL, 1);
239 curl_easy_setopt(cur, CURLOPT_AUTOREFERER,
true);
242 curl_easy_setopt(cur, CURLOPT_WRITEDATA, _null_file.file_handle());
245 curl_easy_setopt(cur, CURLOPT_WRITEFUNCTION,
data_sink);
251 curl_easy_setopt(cur, CURLOPT_FTPLISTONLY, 1);
255 if (_check_redirection) {
257 curl_easy_setopt(cur, CURLOPT_FOLLOWLOCATION, 1);
258 curl_easy_setopt(cur, CURLOPT_MAXREDIRS, 0);
266 error_msg =
astring(
' ', CURL_ERROR_SIZE + 5);
267 curl_easy_setopt(cur, CURLOPT_ERRORBUFFER, error_msg.
s());
272 to_return = curl_easy_perform(cur);
279 if (to_return == CURLE_FILESIZE_EXCEEDED) to_return = 0;
284 curl_easy_getinfo(cur, CURLINFO_RESPONSE_CODE, &result);
286 error_msg =
a_sprintf(
"received http failure code %d", result);
295 curl_easy_cleanup(cur);
300int marks_checker::test_all_links()
306 while ( (curr = itty.
next()) ) {
313 if (!lin->
_url)
continue;
315 while (_checkers.threads() > _max_threads) {
317 _checkers.clean_debris();
320 checking_thread *new_thread =
new checking_thread(*lin, _bad_lines,
326BASE_LOG(
"... finished iterating on tree.");
329 while (_checkers.threads()) {
331 _checkers.clean_debris();
334BASE_LOG(
"... finished waiting for all threads.");
339void marks_checker::write_new_files()
342 byte_filer output_file(_output_filename,
"w");
343 byte_filer badness_file(_bad_link_filename,
"w");
348 BASE_LOG(
"bad links are on lines:");
350 for (
int i = 0; i < badness.
length(); i++) {
351 bad_list +=
a_sprintf(
"%d, ", badness[i]);
357 while (!input_file.eof()) {
359 while (badness.
length() && (badness[0] < curr_line) ) {
363 input_file.getline(buffer, 2048);
365 if (badness.
length() && (badness[0] == curr_line)) {
367 badness_file.write(buffer);
371 output_file.write(buffer);
377 badness_file.close();
382void marks_checker::handle_OS_signal(
int formal(sig_id))
384 signal(SIGINT, SIG_IGN);
385 BASE_LOG(
"caught break signal... now writing files.");
387 BASE_LOG(
"exiting after handling break.");
392int marks_checker::execute()
400 if (!cmds.get_value(
'i', _input_filename,
false))
402 if (!cmds.get_value(
'o', _output_filename,
false))
404 if (!cmds.get_value(
'b', _bad_link_filename,
false))
410 if (cmds.get_value(
"no-redirs", temp,
false)) {
411 BASE_LOG(
"Enabling redirection checking: redirected web sites are reported as bad.");
412 _check_redirection =
true;
416 if (cmds.get_value(
"threads", threads,
false)) {
417 _max_threads = threads.
convert(0);
428LOG(
"before reading input...");
430 int ret = _categories.read_csv_file(_input_filename);
433LOG(
"after reading input...");
435 signal(SIGINT, handle_OS_signal);
439 curl_global_init(CURL_GLOBAL_ALL);
441 ret = test_all_links();
446 curl_global_cleanup();
int print_instructions(bool good, const astring &program_name)
The application_shell is a base object for console programs.
virtual int execute()=0
retrieves the command line from the /proc hierarchy on linux.
a_sprintf is a specialization of astring that provides printf style support.
contents * access()
A non-constant access of the underlying C-array. BE REALLY CAREFUL.
int length() const
Returns the current reported length of the allocated C array.
outcome zap(int start, int end)
Deletes from "this" the objects inclusively between "start" and "end".
Provides a dynamically resizable ASCII character string.
const char * s() const
synonym for observe. the 's' stands for "string", if that helps.
int convert(int default_value) const
Converts the string into a corresponding integer.
void shrink()
changes all occurrences of "to_replace" into "new_string".
auto_synchronizer simplifies concurrent code by automatically unlocking.
virtual outcome log(const base_string &info, int filter)=0
writes the information in "info" to the logger using the "filter".
A simple object that wraps a templated array of ints.
Provides file management services using the standard I/O support.
Provides operations commonly needed on file names.
const basis::astring & raw() const
returns the astring that we're holding onto for the path.
filename basename() const
returns the base of the filename; no directory.
static loggers::standard_log_base & get()
Provided by the startup code within each application for logging.
tree * next()
Returns a pointer to the next tree in the direction of traversal.
A dynamically linked tree with an arbitrary number of branches.
Provides a platform-independent object for adding threads to a program.
virtual void perform_activity(void *thread_data)=0
invoked just after start(), when the OS thread is created.
Manages a collection of threads.
int elements() const
the maximum number of elements currently allowed in this amorph.
contents * borrow(int field)
Returns a pointer to the information at the index "field".
A unique identifier based on integers.
static const char * platform_eol_to_chars()
provides the characters that make up this platform's line ending.
static void sleep_ms(basis::un_int msec)
a system independent name for a forced snooze measured in milliseconds.
#define SETUP_COMBO_LOGGER
a macro that retasks the program-wide logger as a combo_logger.
#define non_continuable_error(c, f, i)
an extra piece of information used, if available, in bounds_halt below.
#define formal(parameter)
This macro just eats what it's passed; it marks unused formal parameters.
#define NULL_POINTER
The value representing a pointer to nothing.
#define DEFINE_CLASS_NAME(objname)
Defines the name of a class by providing a couple standard methods.
#define FUNCDEF(func_in)
FUNCDEF sets the name of a function (and plugs it into the callstack).
Provides macros that implement the 'main' program of an application.
#define HOOPLE_MAIN(obj_name, obj_args)
options that should work for most unix and linux apps.
size_t data_sink(void *formal(ptr), size_t size, size_t number, void *formal(stream))
const int MAXIMUM_ATTEMPTS
const char * FAKE_AGENT_STRING
marks_checker * main_program
const int TIME_PER_REQUEST_IN_SEC
const int MAXIMUM_THREADS
Implements an application lock to ensure only one is running at once.
The guards collection helps in testing preconditions and reporting errors.
const int SECOND_ms
Number of milliseconds in a second.
A platform independent way to obtain the timestamp of a file.
A logger that sends to the console screen using the standard output device.
An extension to floating point primitives providing approximate equality.
A dynamic container class that holds any kind of object via pointers.
#define static_class_name()
Aids in achievement of platform independence.