41 #include <curl/curl.h>
47 using namespace basis;
50 using namespace nodes;
// BASE_LOG(s): wraps "s" in an astring and hands it to the program-wide
// logger's log() method, passing ALWAYS_PRINT as the second argument.
61 #define BASE_LOG(s) program_wide_logger::get().log(astring(s), ALWAYS_PRINT)
// LOG(s): forwards to CLASS_EMERGENCY_LOG using the program-wide logger, with
// the message prefixed by "line N: " where N is _categories._line_number.
// Because the expansion names _categories directly, the macro can only be
// used where a _categories object is in scope.
// NOTE(review): CLASS_EMERGENCY_LOG is defined elsewhere; any extra
// requirements it places on the calling scope are not visible here -- confirm.
63 #define LOG(s) CLASS_EMERGENCY_LOG(program_wide_logger::get(), \
64 a_sprintf("line %d: ", _categories._line_number) + s)
87 "en-US; rv:1.8.19) Flecko/20081031";
// Default constructor: default-constructs _lock and constructs _list with
// the argument 0 (presumably an initially empty list -- confirm against the
// constructor of _list's type, which is not visible here).
98 safe_int_array() : _lock(), _list(0) {}
100 void add(
int to_add) {
108 return _list.length();
132 virtual int execute();
135 int test_all_links();
142 void write_new_files();
147 safe_int_array _bad_lines;
152 bool _check_redirection;
156 static void handle_OS_signal(
int sig_id);
162 class checking_thread :
public ethread
// Constructs a checking thread for a single link.
//   link_info: the link record used to initialize _info.
//   bad_lines: bound to the _bad_lines reference member.
//   checker:   bound to the _checker reference member.
// The ethread base is default-constructed.  Since _bad_lines and _checker are
// references, the objects passed in must outlive this thread object.
165 checking_thread(
const link_record &link_info, safe_int_array &bad_lines,
166 marks_checker &checker)
167 :
ethread(), _bad_lines(bad_lines), _checker(checker), _info(link_info) {}
169 void perform_activity(
void *
formal(ptr)) {
171 int ret = _checker.check_link(_info._url, message);
174 += parser_bits::platform_eol_to_chars();
176 complaint += spacer + _info._url += parser_bits::platform_eol_to_chars();
177 complaint += spacer + _info._description += parser_bits::platform_eol_to_chars();
178 complaint += spacer +
"error: " += message;
180 if ( (_info._uid> 100000) || (_info._uid < 0) ) {
184 _bad_lines.add(_info._uid);
189 safe_int_array &_bad_lines;
190 marks_checker &_checker;
199 This program needs three filenames as command line parameters. The -i flag\n\
200 is used to specify the input filename. The -o flag specifies the file where\n\
201 where the good links will be written. The -b flag specifies the file where\n\
202 the bad links are written. The optional flag --no-redirs can be used to\n\
203 disallow web-site redirection, which will catch when the site has changed\n\
204 its location. Note that redirection is not necessarily an error, but it\n\
205 instead may just be a link that needs its URL modified. It is recommended\n\
206 that you omit this flag in early runs, in order to only locate definitely\n\
207 dead links. Then later checking runs can find any sites that were redirected\n\
208 or being routed to a dead link page which doesn't provide an error code.\n\
209 The optional flag --threads with a parameter will set the maximum number of\n\
210 threads that will simultaneously check on links.\n\
211 The input file is expected to be in the HOOPLE link database format.\n\
212 The HOOPLE link format is documented here:\n\
213 http://feistymeow.org/guides/link_database/format_manifesto.txt\n\
215 program_wide_logger::get().log(to_show, ALWAYS_PRINT);
221 {
return size * number; }
223 int marks_checker::check_link(
const astring &url,
astring &error_msg)
227 CURL *cur = curl_easy_init();
229 curl_easy_setopt(cur, CURLOPT_URL, url.
s());
231 curl_easy_setopt(cur, CURLOPT_SSL_VERIFYPEER, 0);
233 curl_easy_setopt(cur, CURLOPT_MAXFILESIZE,
MAXIMUM_READ);
235 curl_easy_setopt(cur, CURLOPT_NOSIGNAL, 1);
239 curl_easy_setopt(cur, CURLOPT_AUTOREFERER,
true);
242 curl_easy_setopt(cur, CURLOPT_WRITEDATA, _null_file.file_handle());
245 curl_easy_setopt(cur, CURLOPT_WRITEFUNCTION,
data_sink);
251 curl_easy_setopt(cur, CURLOPT_FTPLISTONLY, 1);
255 if (_check_redirection) {
257 curl_easy_setopt(cur, CURLOPT_FOLLOWLOCATION, 1);
258 curl_easy_setopt(cur, CURLOPT_MAXREDIRS, 0);
266 error_msg =
astring(
' ', CURL_ERROR_SIZE + 5);
267 curl_easy_setopt(cur, CURLOPT_ERRORBUFFER, error_msg.
s());
272 to_return = curl_easy_perform(cur);
279 if (to_return == CURLE_FILESIZE_EXCEEDED) to_return = 0;
284 curl_easy_getinfo(cur, CURLINFO_RESPONSE_CODE, &result);
286 error_msg =
a_sprintf(
"received http failure code %d", result);
295 curl_easy_cleanup(cur);
300 int marks_checker::test_all_links()
304 tree::iterator itty = _categories.access_root().start(tree::prefix);
306 while ( (curr = itty.
next()) ) {
313 if (!lin->
_url)
continue;
315 while (_checkers.threads() > _max_threads) {
317 _checkers.clean_debris();
320 checking_thread *new_thread =
new checking_thread(*lin, _bad_lines,
326 BASE_LOG(
"... finished iterating on tree.");
329 while (_checkers.threads()) {
331 _checkers.clean_debris();
334 BASE_LOG(
"... finished waiting for all threads.");
339 void marks_checker::write_new_files()
342 byte_filer output_file(_output_filename,
"w");
343 byte_filer badness_file(_bad_link_filename,
"w");
348 BASE_LOG(
"bad links are on lines:");
350 for (
int i = 0; i < badness.
length(); i++) {
351 bad_list +=
a_sprintf(
"%d, ", badness[i]);
357 while (!input_file.eof()) {
359 while (badness.
length() && (badness[0] < curr_line) ) {
363 input_file.getline(buffer, 2048);
365 if (badness.
length() && (badness[0] == curr_line)) {
367 badness_file.write(buffer);
371 output_file.write(buffer);
377 badness_file.close();
382 void marks_checker::handle_OS_signal(
int formal(sig_id))
384 signal(SIGINT, SIG_IGN);
385 BASE_LOG(
"caught break signal... now writing files.");
387 BASE_LOG(
"exiting after handling break.");
392 int marks_checker::execute()
400 if (!cmds.get_value(
'i', _input_filename,
false))
402 if (!cmds.get_value(
'o', _output_filename,
false))
404 if (!cmds.get_value(
'b', _bad_link_filename,
false))
410 if (cmds.get_value(
"no-redirs", temp,
false)) {
411 BASE_LOG(
"Enabling redirection checking: redirected web sites are reported as bad.");
412 _check_redirection =
true;
416 if (cmds.get_value(
"threads", threads,
false)) {
417 _max_threads = threads.
convert(0);
428 LOG(
"before reading input...");
430 int ret = _categories.read_csv_file(_input_filename);
433 LOG(
"after reading input...");
435 signal(SIGINT, handle_OS_signal);
439 curl_global_init(CURL_GLOBAL_ALL);
441 ret = test_all_links();
446 curl_global_cleanup();
int print_instructions(bool good, const astring &program_name)
The application_shell is a base object for console programs.
a_sprintf is a specialization of astring that provides printf style support.
contents * access()
A non-constant access of the underlying C-array. BE REALLY CAREFUL.
int length() const
Returns the current reported length of the allocated C array.
outcome zap(int start, int end)
Deletes from "this" the objects inclusively between "start" and "end".
Provides a dynamically resizable ASCII character string.
const char * s() const
synonym for observe. the 's' stands for "string", if that helps.
int convert(int default_value) const
Converts the string into a corresponding integer.
void shrink()
changes all occurrences of "to_replace" into "new_string".
auto_synchronizer simplifies concurrent code by automatically unlocking.
A simple object that wraps a templated array of ints.
Provides file management services using the standard I/O support.
Provides operations commonly needed on file names.
const basis::astring & raw() const
returns the astring that we're holding onto for the path.
filename basename() const
returns the base of the filename; no directory.
tree * next()
Returns a pointer to the next tree in the direction of traversal.
A dynamically linked tree with an arbitrary number of branches.
Provides a platform-independent object for adding threads to a program.
Manages a collection of threads.
int elements() const
the maximum number of elements currently allowed in this amorph.
contents * borrow(int field)
Returns a pointer to the information at the index "field".
A unique identifier based on integers.
#define SETUP_COMBO_LOGGER
a macro that retasks the program-wide logger as a combo_logger.
#define non_continuable_error(c, f, i)
an extra piece of information used, if available, in bounds_halt below.
#define formal(parameter)
This macro just eats what it's passed; it marks unused formal parameters.
#define NULL_POINTER
The value representing a pointer to nothing.
#define DEFINE_CLASS_NAME(objname)
Defines the name of a class by providing a couple standard methods.
#define FUNCDEF(func_in)
FUNCDEF sets the name of a function (and plugs it into the callstack).
Provides macros that implement the 'main' program of an application.
#define HOOPLE_MAIN(obj_name, obj_args)
options that should work for most unix and linux apps.
size_t data_sink(void *formal(ptr), size_t size, size_t number, void *formal(stream))
const int MAXIMUM_ATTEMPTS
const char * FAKE_AGENT_STRING
marks_checker * main_program
const int TIME_PER_REQUEST_IN_SEC
const int MAXIMUM_THREADS
Implements an application lock to ensure only one is running at once.
The guards collection helps in testing preconditions and reporting errors.
const int SECOND_ms
Number of milliseconds in a second.
A platform independent way to obtain the timestamp of a file.
A logger that sends to the console screen using the standard output device.
An extension to floating point primitives providing approximate equality.
A dynamic container class that holds any kind of object via pointers.
#define static_class_name()
Aids in achievement of platform independence.