1: #!/usr/bin/perl
3: # spam_o_matic.pl -- automatically respond to job ads on dice
5: # Written by Robert J. Brown
6: # rj@elilabs.com
7: # http://www.elilabs.com/~rj
8: # Fri Jul 19 00:28:42 CDT 2002
10: # Copyright 2002, 2003 Elijah Laboratories Inc.
11: # ALL RIGHTS RESERVED WORLDWIDE
13: # ----------------------------------------------------------------
15: ### version 1 Fri Jul 19 00:28:42 CDT 2002 rj@elilabs.com
17: # Initial release.
19: ### version 2 Wed May 21 04:04:22 CDT 2003 rj@elilabs.com
21: # Major modification to web crawling code required because Dice
22: # totally revamped their site.
24: ### version 3 Mon Feb 2 14:34:07 EST 2004
26: # Minor edits to fix page format changes made by dice.
27: # Changed names.index.jsp file to select by area codes.
29: ### version 4 Fri Mar 26 09:41:05 EST 2004
31: # Added bad_words file to specify words that will cause a job ad to
32: # be rejected if they appear in that job ad.
34: # Added output to STDERR to show what is happening. This is useful
35: # for debugging the bad_words file, although it is pretty noisy.
36: # Probably want to log to a file for production runs, and roll this
37: # log frequently, purging old stuff.
39: # Added debug mode: if spam_o_matic.pl is invoked with a non-empty
40: # argument list, then it runs in debug mode. This means that no email
41: # will be sent, no output will go to stdout, only stderr, and the
42: # history database will not be updated. This is useful for making
43: # test runs after editing the bad_words file.
45: ### BUGS:
47: # Thu May 22 12:01:42 CDT 2003 rj@elilabs.com
49: # Not traversing additional index pages beyond the first page. Seems
50: # to be because lynx is not handling the javascript stuff. When ver 1
51: # was written, Dice did not use javascript.
53: # ----------------------------------------------------------------
55: # THE INFAMOUS SPAM-O-MATIC RESUME MAILER
56: # R. J. Brown, Elijah Laboratories Inc.
58: # This is an automated web client that does data mining by crawling
59: # around the dice jobs board. The email address and job title are
60: # extracted from each job ad meeting selection criteria. This is then
61: # used to email the jobshop or headhunter a resume.
63: # Any email addresses that should *NOT* be sent a copy of your resume
64: # should be listed, 1 per line, in the file $no_spam, along with an
65: # expiration date, so that the shop will not be omitted for all
66: # eternity.
68: # The data files follow the usual comment conventions: any blank
69: # line, and any line starting with a "#" character will be treated as
70: # a comment, and thus ignored.
72: # ASSUMPTIONS: This script assumes you are running on a unix-like
73: # system with perl, lynx, and a live internet connection.
76: # ---------------- BEGIN USER CUSTOMIZATION ----------------
78: # Adjust the following file names/paths to suit your needs...
# Directory that holds all of the spam-o-matic data files.  If everything
# lives together, this is the only line that has to change when the
# installation moves; any individual path below may still be replaced by
# a complete path of its own.
$spam_dir = "/home/rj/work/jobshops/spam/dice/cron";

# the text of the email letter to spam the jobshops with
$email_letter = "$spam_dir/email_letter.txt";

# the list of email addresses to *NOT* spam
$no_spam = "$spam_dir/no_spam.dat";

# the history database name
$hist_db = "$spam_dir/spam_o_matic_history";

# waiting period in days during which duplicate job ads will not be replied to
$hist_waiting_days = 14;        # wait 2 weeks to resubmit for the same job

# NOTE: You may completely reset the history mechanism by manually
# deleting the $hist_db file.  There is presently no easy way to reset
# a single entry.

# The filename and path to the list of bad words that cause us to not
# respond to a job ad.
$bad_words = "$spam_dir/bad_words.dat";
101: # ---------------- END USER CUSTOMIZATION ----------------
104: # load library routines
105: use Time::Local;
# The DEBUG flag tells us not to actually send any emails, or write
# the log file, or update the history database.  This is used for
# testing, especially to test BAD_WORDS.
#
# Debug mode is selected by ANY non-empty argument list, so the argument
# count is tested rather than the text of the first argument (an explicit
# empty first argument used to leave debug mode off).
$DEBUG = (@ARGV ? 1 : 0);       # set debug flag to proper state
print STDERR "\n---------------- DEBUG MODE! ----------------\n\n" if $DEBUG;

# running totals, reported on STDERR when the run finishes
$num_ads = 0;
$num_rejects = 0;
$num_ok = 0;

# set today's date in epoch time
$epoch_today = time;

# set the history waiting period in seconds
# (the variable name is misspelled, but the rest of the script refers to
# it by this exact name, so it has to stay as it is)
$hist_waiting_peroid = 24*60*60 * $hist_waiting_days;

# set TERM=vt100 for lynx <sigh>
$ENV{"TERM"} = "vt100";

# Open the data files read-only.  The three-argument form of open keeps
# the file name from ever being interpreted as an open mode or a pipe.
open(NO_SPAM, '<', $no_spam)
    || die "cannot open $no_spam: $!";

open(BAD_WORDS, '<', $bad_words)
    || die "cannot open $bad_words: $!";

open(EMAIL_LETTER, '<', $email_letter)
    || die "cannot open $email_letter: $!";
close(EMAIL_LETTER);            # we were just testing its availability

# open the history database or create it
dbmopen(%hist, $hist_db, 0600)
    || die "cannot open $hist_db: $!";
# read the negative file into a hash for faster checking
read_no_spam_list(\*NO_SPAM, \%no_spam);

# read_no_spam_list(FILEHANDLE, HASHREF)
#
# Reads "email expiration-date [comment...]" lines from FILEHANDLE and
# stores expiration-date in the hash under the email address.  Blank
# lines and lines whose first non-blank character is "#" are ignored.
# A "#" later in a line (for example inside the trailing comment field)
# does NOT turn the whole line into a comment; that used to make such
# entries vanish silently.  Returns the number of entries stored.
sub read_no_spam_list {
    my ($fh, $table) = @_;
    my $stored = 0;

    while (my $line = <$fh>) {
        chomp $line;

        next if $line =~ /^\s*#/;       # skip comment lines
        next unless $line =~ /\S/;      # skip blank lines

        # split ' ' ignores leading whitespace, so an indented entry is
        # not stored under an empty-string key; anything after the date
        # is free-form comment and is dropped
        my ($email, $date) = split(' ', $line);

        $table->{$email} = $date;       # load it into the hash
        $stored++;
    }

    return $stored;
}
# read the bad words list into an array for faster checking
push(@bad_word, read_bad_words(\*BAD_WORDS));

# read_bad_words(FILEHANDLE)
#
# Returns the list of bad words/phrases found on FILEHANDLE, one per
# line, in file order.  Lines starting with "#" are comments.  Lines that
# are empty OR contain only whitespace are skipped: a whitespace-only
# entry would otherwise be used as a pattern that matches nearly every
# line of every job ad and so reject everything.  A trailing carriage
# return (DOS-format file) is removed so it cannot become part of the
# pattern.  Leading/inner/trailing blanks of a real entry are preserved.
sub read_bad_words {
    my ($fh) = @_;
    my @words;

    while (my $line = <$fh>) {
        $line =~ s/[\r\n]+$//;          # strip the line ending (LF or CRLF)

        next unless $line =~ /\S/;      # skip blank lines
        next if $line =~ /^#/;          # skip comments

        push(@words, $line);            # save the bad word or phrase
    }

    return @words;
}
# Visit the job-search front page once with cookies enabled so that lynx
# stores whatever cookies the server sets in dice.cookies.  The page source
# that comes back is not needed for anything and is thrown away.
$cmd = join(" ",
            "lynx",
            "-accept_all_cookies",
            "-cookie_file=dice.cookies",
            "-source",
            "'http://seeker.dice.com/jobsearch/index.jsp'");

qx{$cmd};
# ----------------------------------------------------------------
# Crawl the search results, examine each job ad, and mail a resume to
# every ad that passes the bad-word, no_spam, and history checks.
#
# Everything scraped from the web pages (job ad url, title, position id,
# email address) is untrusted text.  None of it is passed through a shell:
# lynx and sendmail are started with list-form pipe opens (Perl 5.8+), and
# the mail headers are written to sendmail's stdin directly.
# ----------------------------------------------------------------

# Compile the bad words once instead of once per job-ad line.  Each entry
# is [ original text, compiled pattern ].
my @bad_word_re = compile_bad_words(@bad_word);

# read the first page of results

$first_page = 1;                # true

# for all the pages in the search results
do {

    if ($first_page == 1) {

        # run lynx on the url to fetch that index page; the search form
        # fields come from names.index.jsp, joined with "&" as post data

        $cmd = "grep -v '^#' names.index.jsp"
             . " | tr '\\n' '&'"
             . " | lynx -post_data"
             . " -accept_all_cookies"
             . " -cookie_file=dice.cookies"
             . " -source 'http://seeker.dice.com/jobsearch/servlet/JobSearch'";

        open(INDEX_PAGE, "$cmd |")
            || warn "cannot fetch first index page: $!";

        $first_page = 0;        # false

    } else {

        # NOTE: not reached while the loop condition below is the
        # constant 0 (paging is switched off, see BUGS in the header).
        # $prev_doc/$next_doc are digits-only (see the patterns below).

        $post_data = "PREV_DOC=$prev_doc&"
                   . "NEXT_DOC=$next_doc&"
                   . "op=1001&"
                   . "NEXT.x=4&"
                   . "NEXT.y=7&"
                   . "NEXT=1";

        $cmd = "lynx -accept_all_cookies"
             . " -cookie_file=dice.cookies"
             . " -source 'http://seeker.dice.com/jobsearch/servlet/JobSearch?$post_data'";

        open(INDEX_PAGE, "$cmd |")
            || warn "cannot fetch non-first index page: $!";
    }

    $prev_doc = 0;
    $next_doc = 0;

    # for each line in the index page...
  index_line: while (<INDEX_PAGE>) {
        chomp;
        s/\r//g;

        # scan for prev/next job ad numbers for prev/next web pages
        if (/<input type="hidden" name="PREV_DOC" value="([0-9]+)">/) {         # is this the previous job ad number?
            $prev_doc = $1;     # yes, remember it

        } elsif (/<input type="hidden" name="NEXT_DOC" value="([0-9]+)">/) {    # is this the next job ad number?
            $next_doc = $1;     # yes, remember it

        } elsif (/\(score:.+(.+)/) {    # is it the url of a job ad?

            # read next line -- they changed the format, *AGAIN*... <groan>
            next index_line unless defined($_ = <INDEX_PAGE>);

            # extract the job ad's url from it; stop at the closing quote
            # so a later attribute on the same line is not swallowed
            next index_line unless (/href="([^"]*)"/);

            # yes, look at job ads and process them
            $job_ad_url = "http://seeker.dice.com" . $1;

            $num_ads++;         # count the job ad

            # fetch that job ad as formatted text, not raw html
            # (list form: the url never reaches a shell)
            unless (open(JOB_AD_PAGE, '-|', 'lynx', '-dump', $job_ad_url)) {
                warn "cannot fetch job ad $job_ad_url: $!";
                next index_line;
            }

            # initially clear out the fields we are looking for
            $job_title = "";
            $position = "";
            $email_address = "";

            # for each line in the job ad...
            while (<JOB_AD_PAGE>) {
                chomp;

                # Here we gather the data we need to send the email

                if (/Title: +(.+)/) {           # extract the job title
                    $job_title = $1;
                }

                if (/Position ID: +(.+)/) {     # extract job id
                    $position = $1;
                }

                if (/E-mail:.*\](.+)/) {        # extract the email address
                    $email_address = $1;
                }

                # Here we check for "bad words".  These are words that, if
                # we find them in a job ad, then we DO NOT want to respond
                # to that ad.  They come from the @bad_word array filled in
                # at start-up.

                foreach my $entry (@bad_word_re) {
                    my ($word, $pattern) = @$entry;
                    if ($_ =~ $pattern) {       # keyword match ignoring case
                        print STDERR "REJECT: $word ==> $_\n";
                        $num_rejects++;         # count the reject
                        close(JOB_AD_PAGE);     # done with this ad
                        next index_line;
                    }
                }
            }
            close(JOB_AD_PAGE);

            # surrounding blanks would defeat the no_spam and history lookups
            $email_address =~ s/^\s+//;
            $email_address =~ s/\s+$//;

            # did we find everything we were looking for?
            if ($job_title eq ""
                || $position eq ""
                || $email_address eq "") {

                next index_line;        # no, skip this one
            }

            # The address becomes a sendmail argument: insist on a single
            # blank-free user@host token that cannot be taken for an option.
            unless ($email_address =~ /^[^\s\@-][^\s\@]*\@[^\s\@]+$/) {
                print STDERR "SKIP: unusable email address '$email_address'\n";
                next index_line;
            }

            # Yes, we found them all.  Now that we have the email address,
            # check the negative file to see if we should skip this one.

            # is there an entry for that email address?
            if ($exp_date = $no_spam{$email_address}) {

                # yes; skip this guy unless the entry has expired
                next index_line
                    if no_spam_still_in_force($exp_date, $epoch_today);
            }

            # Check the history database to see if this job ad has already
            # been answered within the waiting period.

            # form the key from the email address and the position id
            $hist_key = $email_address . " " . $position;

            # have we applied for this job before?
            if ($hist_timestamp = $hist{$hist_key}) {

                # yes, was it so long ago that we should reapply?
                if ($hist_timestamp + $hist_waiting_peroid > $epoch_today) {
                    next index_line;    # no, do not email this guy
                }
            }

            printf STDERR ("FOUND: %40s %s\n", $email_address, $job_title);

            # log what is about to be mailed (stdout is silent in debug mode)
            printf("MAIL: %s -- ID: %s -- %s -- %s\n",
                   $email_address, $position, $job_title, $job_ad_url)
                unless $DEBUG;

            $num_ok++;          # count the good ad

            unless ($DEBUG) {

                # send the email, and only remember the job in the history
                # database if the mail was actually handed to sendmail, so
                # a failed send is retried on the next run
                if (send_resume($email_address, $position, $job_title,
                                $job_ad_url, $email_letter)) {
                    $hist{$hist_key} = $epoch_today;
                }
            }
        }
    }                           # for each line in index page

    close(INDEX_PAGE);

} while (0);    # ($next_doc != "0");   # loop thru index pages


# compile_bad_words(LIST)
#
# Turns each bad word/phrase into [ text, case-insensitive pattern ].
# Entries are treated as regular expressions, as before.  An entry that
# is not a valid regular expression (e.g. "C++") used to abort the run at
# match time; it is now reported and matched as literal text instead.
# Empty entries are dropped.
sub compile_bad_words {
    my @compiled;

    foreach my $word (@_) {
        next if !defined($word) || $word eq "";

        my $pattern = eval { qr/$word/i };
        unless (defined $pattern) {
            warn "bad word '$word' is not a valid pattern, matching it literally\n";
            $pattern = qr/\Q$word\E/i;
        }
        push(@compiled, [ $word, $pattern ]);
    }

    return @compiled;
}


# no_spam_still_in_force(EXP_DATE, NOW)
#
# EXP_DATE is the mm/dd/yy expiration date from the no_spam file, NOW is
# the current epoch time.  Returns true while the entry has not expired.
# A two-digit year means 20yy; a four-digit year is taken as written.
# A date that cannot be understood is reported and treated as still in
# force: better to skip one mailing than to mail somebody on the
# do-not-mail list (or to die inside timelocal in the middle of a run).
sub no_spam_still_in_force {
    my ($exp_date, $now) = @_;

    my ($mm, $dd, $yy) = split(/\//, $exp_date);

    foreach my $part ($mm, $dd, $yy) {
        unless (defined($part) && $part =~ /^\d+$/) {
            warn "unparseable no_spam expiration date '$exp_date'\n";
            return 1;
        }
    }

    $yy += 2000 if $yy < 100;   # year must be full 4 digits

    # month is 0-origin <groan>; timelocal dies on impossible dates
    my $epoch_exp = eval { timelocal(0, 0, 0, $dd, $mm - 1, $yy) };
    unless (defined $epoch_exp) {
        warn "invalid no_spam expiration date '$exp_date'\n";
        return 1;
    }

    return ($now < $epoch_exp) ? 1 : 0;
}


# send_resume(TO, POSITION, JOB_TITLE, JOB_AD_URL, LETTER_FILE)
#
# Pipes the Subject/Reply-To/X-Url headers followed by the contents of
# LETTER_FILE into "/usr/lib/sendmail TO".  Returns 1 if sendmail accepted
# the message (exit status 0), 0 otherwise (with a warning).
#
# The letter file is copied immediately after the three headers, exactly
# as the old "echo ...; cat letter" pipeline did.
# NOTE(review): presumably the letter file itself starts with any further
# headers and/or the blank line that ends the header section -- confirm.
sub send_resume {
    my ($to, $position, $job_title, $job_ad_url, $letter_file) = @_;

    my @headers = (
        "Subject: ID: $position -- $job_title",
        "Reply-To: rj\@elilabs.com",    # was "ReplyTo", which is not a standard header
        "X-Url: $job_ad_url",
    );

    # scraped text must not be able to smuggle in extra header lines
    foreach my $header (@headers) {
        $header =~ s/[\r\n]+/ /g;
    }

    my $letter;
    unless (open($letter, '<', $letter_file)) {
        warn "cannot open $letter_file: $!";
        return 0;
    }

    # a sendmail that exits early must not kill us with SIGPIPE
    local $SIG{PIPE} = 'IGNORE';

    my $mailer;
    unless (open($mailer, '|-', '/usr/lib/sendmail', $to)) {
        warn "cannot run /usr/lib/sendmail: $!";
        close($letter);
        return 0;
    }

    foreach my $header (@headers) {
        print $mailer "$header\n";
    }
    while (my $line = <$letter>) {
        print $mailer $line;
    }
    close($letter);

    # close() on a pipe waits for sendmail and reports its exit status
    unless (close($mailer)) {
        warn "sendmail failed for $to: " . ($! ? $! : "exit status $?") . "\n";
        return 0;
    }

    return 1;
}
# Closing tally on STDERR: job ads fetched, ads rejected because of a bad
# word, and ads that passed every check.
print STDERR "\n\n---------------- "
           . "PROCESSED: " . $num_ads
           . " REJECTED: " . $num_rejects
           . " ACCEPTED: " . $num_ok
           . " ----------------\n\n";