spam_o_matic.pl

  1: #!/usr/bin/perl

  3: # spam_o_matic.pl -- automatically respond to job ads on dice

  5: # Written by Robert J. Brown
  6: # rj@elilabs.com
  7: # http://www.elilabs.com/~rj
  8: # Fri Jul 19 00:28:42 CDT 2002

 10: # Copyright 2002, 2003 Elijah Laboratories Inc.
 11: # ALL RIGHTS RESERVED WORLDWIDE

 13: # ----------------------------------------------------------------

 15: ### version 1 Fri Jul 19 00:28:42 CDT 2002 rj@elilabs.com

 17: # Initial release.

 19: ### version 2 Wed May 21 04:04:22 CDT 2003 rj@elilabs.com

 21: # Major modification to web crawling code required because Dice
 22: # totally revamped their site.

 24: ### version 3 Mon Feb  2 14:34:07 EST 2004

 26: # Minor edits to fix page format changes made by dice.
 27: # Changed names.index.jsp file to select by area codes.

 29: ### version 4 Fri Mar 26 09:41:05 EST 2004

 31: # Added bad_words file to specify words that will cause a job ad to
 32: # be rejected if they appear in that job ad.

 34: # Added output to STDERR to show what is happening.  This is useful
 35: # for debugging the bad_words file, although it is pretty noisy.
 36: # Probably want to log to a file for production runs, and roll this
 37: # log frequently, purging old stuff.

 39: # Added debug mode: if spam_o_matic.pl is invoked with a non-empty
 40: # argument list, then it runs in debug mode.  This means that no email
 41: # will be sent, no output will go to stdout, only stderr, and the
 42: # history database will not be updated.  This is useful for making
 43: # test runs after editing the bad_words file.

 45: ### BUGS:

 47: # Thu May 22 12:01:42 CDT 2003 rj@elilabs.com

 49: # Not traversing additional index pages beyond the first page.  Seems
 50: # to be because lynx is not handling the javascript stuff.  When ver 1
 51: # was written, Dice did not use javascript.

 53: # ----------------------------------------------------------------

 55: #               THE INFAMOUS SPAM-O-MATIC RESUME MAILER
 56: #                R. J. Brown, Elijah Laboratories Inc.

 58: # This is an automated web client that does data mining by crawling
 59: # around the dice jobs board.  The email address and job title are
 60: # extracted from each job ad meeting selection criteria.  This is then
 61: # used to email the jobshop or headhunter a resume.

 63: # Any email addresses that should *NOT* be sent a copy of your resume
 64: # should be listed, 1 per line, in the file $no_spam, along with an
 65: # expiration date, so that the shop will not be omitted for all
 66: # eternity.

 68: # The data files follow the usual comment conventions: any blank
 69: # line, and any line starting with a "#" character, will be treated as
 70: # a comment, and thus ignored.

 72: # ASSUMPTIONS: This script assumes you are running on a unix-like
 73: # system with perl, lynx, and a live internet connection.


 76: # ---------------- BEGIN USER CUSTOMIZATION ----------------

 78: # Adjust the following directory and file names to suit your needs...
 79: my $cron_dir = '/home/rj/work/jobshops/spam/dice/cron';        # every data file lives here
 80: # the letter that is mailed to the jobshops (appended after the headers)
 81: $email_letter = "$cron_dir/email_letter.txt";

 83: # the addresses that must *NOT* be mailed, each with an expiration date
 84: $no_spam = "$cron_dir/no_spam.dat";

 86: # the base name of the history database
 87: $hist_db = "$cron_dir/spam_o_matic_history";

 89: # how many days must pass before the same job ad is answered again
 90: $hist_waiting_days = 14;                # i.e. 2 weeks

 92: # NOTE: The history mechanism can be completely reset by manually
 93: # deleting the $hist_db file.  There is presently no easy way to reset
 94: # a single entry.

 96: # The list of bad words; a job ad containing any of them is not
 97: # answered.

 99: $bad_words = "$cron_dir/bad_words.dat";

101: # ---------------- END USER CUSTOMIZATION ----------------


104: # load library routines
105: use Time::Local;

107: # Debug mode is selected by passing any non-empty first argument.  In
108: # debug mode no email is sent, nothing is printed to stdout, and the
109: # history database is not updated.  Useful for testing the bad_words file.

111: $DEBUG = (defined $ARGV[0] && $ARGV[0] ne '') ? 1 : 0;        # no warning when @ARGV is empty
112: print STDERR "\n---------------- DEBUG MODE! ----------------\n\n" if $DEBUG;

114: $num_ads = 0;                                        # job ads fetched
115: $num_rejects = 0;                                # job ads rejected because of a bad word
116: $num_ok = 0;                                        # job ads accepted

118: # set today's date in epoch time
119: $epoch_today = time;

121: # set the history waiting period in seconds
122: $hist_waiting_peroid = 24*60*60 * $hist_waiting_days;

124: # set TERM=vt100 for lynx <sigh>
125: $ENV{"TERM"} = "vt100";

127: open(NO_SPAM, "<", $no_spam)                        # 3-arg open: the name is never parsed for a mode
128:   || die "cannot open $no_spam: $!";

130: open(BAD_WORDS, "<", $bad_words)
131:   || die "cannot open $bad_words: $!";

133: open(EMAIL_LETTER, "<", $email_letter)
134:   || die "cannot open $email_letter: $!";
135: close(EMAIL_LETTER);                                # we were just testing its availability

137: # open the history database, creating it (owner read/write only) if needed
138: dbmopen(%hist, $hist_db, 0600)
139:   || die "cannot open $hist_db: $!";

141: # Read the negative (no-spam) file into %no_spam, keyed by email address,
142: # with the expiration date (mm/dd/yy) as the value.
143: while (<NO_SPAM>) {
144:   chomp;
145:   next unless /\S/;                                # skip blank lines
146:   next if /^\s*\#/;                                # skip comment lines -- only a *leading* "#" counts,
147:                                                    # so a "#" inside an entry's comment is harmless
148:   ($email, $date, $comment) = split(' ');        # parse the line, ignoring leading whitespace
149:   $no_spam{$email} = $date;                        # load it into the hash
150: }
151: close(NO_SPAM);

153: # Read the bad words list into @bad_word for faster checking.  Each entry
154: # is later matched, ignoring case, as a pattern against job ad lines.
155: $i = 0;
156: while (<BAD_WORDS>) {
157:   chomp;
158:   s/\r$//;                                        # tolerate DOS line endings
159:   next unless /\S/;                                # skip blank lines; a whitespace-only entry would
160:                                                    # otherwise match (and reject) nearly every ad
161:   next if /^#/;                                        # skip comments
162:   $bad_word[$i++] = $_;                                # save the bad word or phrase
163: }

165: # Connect with the server once so that lynx stores the session cookies in dice.cookies.  The page itself is discarded.
166: $cmd = join(" ",
167:   "lynx -accept_all_cookies",
168:   "-cookie_file=dice.cookies",
169:   "-source",
170:   "'http://seeker.dice.com/jobsearch/index.jsp'");
171: qx{$cmd};

173: # Crawl the search results: fetch an index page, then fetch, screen, and
174: # answer every job ad listed on it.  Only the first index page is ever
175: # processed, because the loop condition at the bottom is hard-wired to 0.
176: $first_page = 1;                                # true

178: do {

180:   if ($first_page == 1) {

182:     # run lynx on the url to fetch that index page

184:     $cmd =  "grep -v '^#' names.index.jsp"
185:       . " | tr '\\n' '&'"
186:         . " | lynx -post_data"
187:           . " -accept_all_cookies"
188:             . " -cookie_file=dice.cookies"
189:               . " -source 'http://seeker.dice.com/jobsearch/servlet/JobSearch'";

191:     open(INDEX_PAGE, "$cmd |")
192:       || warn "cannot fetch first index page: $!";

194:     $first_page = 0;                                # false

196:   } else {

198:     $post_data = "PREV_DOC=$prev_doc&"
199:                     . "NEXT_DOC=$next_doc&"
200:                       . "op=1001&"
201:                         . "NEXT.x=4&"
202:                           . "NEXT.y=7&"
203:                             . "NEXT=1";

205:     $cmd = "lynx -accept_all_cookies"
206:           . " -cookie_file=dice.cookies"
207:             . " -source 'http://seeker.dice.com/jobsearch/servlet/JobSearch?$post_data'";

209:     open(INDEX_PAGE, "$cmd |")
210:       || warn "cannot fetch non-first index page: $!";
211:   }

213:   $prev_doc = 0;
214:   $next_doc = 0;

216:   # for each line in the index page...
217:  index_line: while (<INDEX_PAGE>) {
218:     chomp;
219:     s/\r//g;

221:     # scan for prev/next job ad numbers for prev/next web pages
222:     if (/<input type="hidden" name="PREV_DOC" value="([0-9]+)">/) { # is this the previous job ad number?
223:       $prev_doc = $1;                                # yes, remember it

225:     } elsif (/<input type="hidden" name="NEXT_DOC" value="([0-9]+)">/) { # is this the next job ad number?
226:       $next_doc = $1;                                # yes, remember it

228:     } elsif (/\(score:.+(.+)/) {                # is it the score line that precedes a job ad's link?

230:       # read next line -- they changed the format, *AGAIN*... <groan>

232:       next index_line unless defined($_ = <INDEX_PAGE>);

234:       # extract the job ad's url from it; stop at the closing quote so that
235:       # any later attributes on the same line are not swallowed
236:       next index_line unless (/href="([^"]*)"/);

238:       # yes, look at job ads and process them

240:       $job_ad_url = "http://seeker.dice.com" . $1;

242:       # fetch that job ad as formatted text, not raw html (list form: the scraped url never reaches a shell)
243:       open(JOB_AD_PAGE, "-|", "lynx", "-dump", $job_ad_url)
244:         || warn "cannot fetch job ad $job_ad_url: $!";

246:       $num_ads++;                                # count the job ad

248:       # initially clear out the fields we are looking for
249:       $job_title = "";
250:       $position = "";
251:       $email_address = "";

253:       # for each line in the job ad...

255:       while (<JOB_AD_PAGE>) {
256:         chomp;

258:         # Here we gather the data we need to send the email

260:         if (/Title: +(.+)/) {                        # extract the job title
261:           $job_title = $1;
262:         }

264:         if (/Position ID: +(.+)/) {                # extract job id
265:           $position = $1;
266:         }

268:         if (/E-mail:.*\](.+)/) {                # extract the email address
269:           $email_address = $1;
270:         }

272:         # Here we check for "bad words".  These are words that, if we
273:         # find them in a job ad, then we DO NOT want to respond to
274:         # that ad.  These words were read in from the bad_words.dat
275:         # file when the spam_o_matic started up, and placed in the
276:         # @bad_word array for ready reference.

278:         foreach $word (@bad_word) {
279:           if (/$word/i) {                        # keyword match ignoring case
280:             print STDERR "REJECT: $word ==> $_\n";
281:             $num_rejects++;                        # count the reject
282:             next index_line;
283:           }
284:         }

286:       }

288:       # did we find everything we were looking for?
289:       if ($job_title eq ""
290:           || $position eq ""
291:           || $email_address eq "") {

293:         next index_line;                        # no, skip this one
294:       }

296:       # Yes, we found them all.  Now that we have the email address,
297:       # check the negative file to see if we should skip this one.

299:       # is there an entry for that email address?
300:       if ($exp_date = $no_spam{$email_address}) {

302:         # yes, parse it and convert the expiration date to epoch time
303:         ($mm, $dd, $yy) = split(/\//, $exp_date);
304:         $mm -= 1;                                # month is 0-origin <groan>
305:         $yy += 2000 if $yy < 100;                # expand a 2-digit year; leave a 4-digit year alone
306:         $epoch_exp = timelocal(0, 0, 0, $dd, $mm, $yy);

308:         # is it expired?
309:         if ($epoch_today < $epoch_exp) {
310:           next index_line;                        # no, do not email this guy
311:         }
312:       }

314:       # Check the history database (keyed by email address plus position ID) to
315:       # see if this job ad has already been answered within the waiting period.
316:       $hist_key = $email_address . " " . $position;
317:       $hist_timestamp = $hist{$hist_key};
318:       next index_line                                # answered too recently, do not email this guy
319:         if $hist_timestamp && $hist_timestamp + $hist_waiting_peroid > $epoch_today;

321:       # Send the email.  Every field used below was scraped from a web page we
322:       # do not control, so no shell is involved: sendmail is run through a
323:       # list-form pipe open (perl 5.8+) and the headers go to its stdin.
324:       @recipients = split(' ', $email_address);        # also drops stray whitespace
325:       next index_line unless @recipients;                # nothing usable to mail to
326:       $headers = "Subject: ID: $position -- $job_title\n"
327:         . "Reply-To: rj\@elilabs.com\n"                # the standard header name is "Reply-To"
328:         . "X-Url: $job_ad_url\n";

330:       printf STDERR ("FOUND: %40s  %s\n",  $email_address, $job_title);
331:       $num_ok++;                                # count the good ad
332:       next index_line if $DEBUG;                # debug mode: no stdout, no email, no history update

334:       printf("/usr/lib/sendmail -- %s\n%s", "@recipients", $headers);        # log what is being sent
335:       unless (open(LETTER, "<", $email_letter)
336:               && open(SENDMAIL, "|-", "/usr/lib/sendmail", "--", @recipients)) {        # "--": an address can never be an option
337:         warn "cannot send to $email_address: $!\n";
338:         next index_line;                        # history untouched, so a later run retries
339:       }
340:       local $SIG{PIPE} = 'IGNORE';                # a sendmail that exits early must not kill this script
341:       print SENDMAIL $headers, <LETTER>;        # headers first, then the letter file verbatim
342:       close(LETTER);
343:       close(SENDMAIL) || warn "sendmail failed for $email_address (status $?)\n";
344:       $hist{$hist_key} = $epoch_today;                # update the history so we don't send duplicates for the same job
345:     }
346:   } # for each line in index page
347: } while (0); # ($next_doc != "0");                        # loop thru index pages

349: printf STDERR ("\n\n---------------- PROCESSED: %s  REJECTED: %s  ACCEPTED: %s ----------------\n\n", $num_ads, $num_rejects, $num_ok);        # final tally of the run