#!/usr/bin/perl

# spam_o_matic.pl -- automatically respond to job ads on dice

# Written by Robert J. Brown
# rj@elilabs.com
# http://www.elilabs.com/~rj
# Fri Jul 19 00:28:42 CDT 2002

# Copyright 2002, 2003 Elijah Laboratories Inc.
# ALL RIGHTS RESERVED WORLDWIDE

# ----------------------------------------------------------------

### version 1   Fri Jul 19 00:28:42 CDT 2002   rj@elilabs.com

# Initial release.

### version 2   Wed May 21 04:04:22 CDT 2003   rj@elilabs.com

# Major modification to web crawling code required because Dice
# totally revamped their site.

### version 3   Mon Feb  2 14:34:07 EST 2004

# Minor edits to fix page format changes made by dice.
# Changed names.index.jsp file to select by area codes.

### version 4   Fri Mar 26 09:41:05 EST 2004

# Added bad_words file to specify words that will cause a job ad to
# be rejected if they appear in that job ad.

# Added output to STDERR to show what is happening.  This is useful
# for debugging the bad_words file, although it is pretty noisy.
# Probably want to log to a file for production runs, and roll this
# log frequently, purging old stuff.

# Added debug mode: if spam_o_matic.pl is invoked with a non-empty
# argument list, then it runs in debug mode.  This means that no email
# will be sent, no output will go to stdout, only stderr, and the
# history database will not be updated.  This is useful for making
# test runs after editing the bad_words file.

### BUGS:

# Thu May 22 12:01:42 CDT 2003  rj@elilabs.com

# Not traversing additional index pages beyond the first page.  Seems
# to be because lynx is not handling the javascript stuff.  When ver 1
# was written, Dice did not use javascript.

# ----------------------------------------------------------------

#            THE INFAMOUS SPAM-O-MATIC RESUME MAILER
#            R. J. Brown, Elijah Laboratories Inc.

# This is an automated web client that does data mining by crawling
# around the dice jobs board.  The email address and job title are
# extracted from each job ad meeting selection criteria.  This is then
# used to email the jobshop or headhunter a resume.

# Any email addresses that should *NOT* be sent a copy of your resume
# should be listed, 1 per line, in the file $no_spam, along with an
# expiration date (mm/dd/yy), so that the shop will not be omitted for
# all eternity.

# The data files follow the usual comment conventions: any blank
# line, and any line starting with a "#" character will be treated as
# a comment, and thus ignored.

# ASSUMPTIONS: This script assumes you are running on a unix-like
# system with perl, lynx, and a live internet connection.

use strict;
use warnings;

# ---------------- BEGIN USER CUSTOMIZATION ----------------

# Adjust the following file names/paths to suit your needs...

# the text of the email letter to spam the jobshops with
my $email_letter = "/home/rj/work/jobshops/spam/dice/cron/email_letter.txt";

# the list of email addresses to *NOT* spam
my $no_spam = "/home/rj/work/jobshops/spam/dice/cron/no_spam.dat";

# the history database name
my $hist_db = "/home/rj/work/jobshops/spam/dice/cron/spam_o_matic_history";

# waiting period in days during which duplicate job ads will not be replied to
my $hist_waiting_days = 14;     # wait 2 weeks to resubmit for the same job

# NOTE: You may completely reset the history mechanism by manually
# deleting the $hist_db file.  There is presently no easy way to reset
# a single entry.

# The filename and path to the list of bad words that cause us to not
# respond to a job ad.  Each non-comment line is used as a
# case-insensitive regular expression.
my $bad_words = "/home/rj/work/jobshops/spam/dice/cron/bad_words.dat";

# ---------------- END USER CUSTOMIZATION ----------------

# load library routines
use Time::Local;

# The DEBUG flag tells us not to actually send any emails, or write
# the log file, or update the history database.  This is used for
# testing, especially to test BAD_WORDS.  It is set when the first
# command line argument is present and non-empty.
my $DEBUG = (defined $ARGV[0] && $ARGV[0] ne '') ? 1 : 0;
print STDERR "\n---------------- DEBUG MODE! ----------------\n\n" if $DEBUG;

my $num_ads     = 0;            # job ads fetched
my $num_rejects = 0;            # job ads rejected by a bad word
my $num_ok      = 0;            # job ads accepted for mailing

# set today's date in epoch time
my $epoch_today = time;

# set the history waiting period in seconds
my $hist_waiting_period = 24*60*60 * $hist_waiting_days;

# set TERM=vt100 for lynx <sigh>
$ENV{"TERM"} = "vt100";

# Open (or at least probe) every input up front, so that a missing
# file aborts the run before any web traffic is generated.
open(my $no_spam_fh, '<', $no_spam)
    or die "cannot open $no_spam: $!";

open(my $bad_words_fh, '<', $bad_words)
    or die "cannot open $bad_words: $!";

open(my $letter_probe_fh, '<', $email_letter)
    or die "cannot open $email_letter: $!";
close($letter_probe_fh);        # we were just testing its availability

# open the history database or create it
my %hist;
dbmopen(%hist, $hist_db, 0600)
    or die "cannot open $hist_db: $!";

# read the negative file into a hash for faster checking
my %no_spam = load_no_spam($no_spam_fh);
close($no_spam_fh);

# read the bad words list into an array for faster checking
my @bad_words = load_bad_words($bad_words_fh);
close($bad_words_fh);

# Connect with the server to get the cookies.  The page itself is
# fetched only for its side effect on the cookie file and is discarded.
my $cookie_cmd = "lynx -accept_all_cookies"
    . " -cookie_file=dice.cookies"
    . " -source"
    . " 'http://seeker.dice.com/jobsearch/index.jsp'";

my $discarded_page = `$cookie_cmd`;

# read the first page of results

my $first_page = 1;             # true
my $prev_doc   = 0;
my $next_doc   = 0;

# for all the pages in the search results
do {

    my $cmd;
    my $page_kind;

    if ($first_page) {

        # Post the search form described by names.index.jsp (read from
        # the current directory) to fetch the first index page.  The
        # command contains only constants, so running it through the
        # shell is safe.
        $cmd = "grep -v '^#' names.index.jsp"
            . " | tr '\\n' '&'"
            . " | lynx -post_data"
            . " -accept_all_cookies"
            . " -cookie_file=dice.cookies"
            . " -source 'http://seeker.dice.com/jobsearch/servlet/JobSearch'";

        $page_kind  = "first";
        $first_page = 0;        # false

    } else {

        # $prev_doc and $next_doc are digits only (see the patterns
        # below), so they are safe to interpolate into the command.
        my $post_data = "PREV_DOC=$prev_doc&"
            . "NEXT_DOC=$next_doc&"
            . "op=1001&"
            . "NEXT.x=4&"
            . "NEXT.y=7&"
            . "NEXT=1";

        $cmd = "lynx -accept_all_cookies"
            . " -cookie_file=dice.cookies"
            . " -source 'http://seeker.dice.com/jobsearch/servlet/JobSearch?$post_data'";

        $page_kind = "non-first";
    }

    $prev_doc = 0;
    $next_doc = 0;

    my $index_fh;
    if (open($index_fh, '-|', $cmd)) {

        # for each line in the index page...
        INDEX_LINE: while (my $index_line = <$index_fh>) {
            chomp $index_line;
            $index_line =~ s/\r//g;

            # scan for prev/next job ad numbers for prev/next web pages
            if ($index_line =~ /<input type="hidden" name="PREV_DOC" value="([0-9]+)">/) {
                $prev_doc = $1; # the previous job ad number
                next INDEX_LINE;
            }
            if ($index_line =~ /<input type="hidden" name="NEXT_DOC" value="([0-9]+)">/) {
                $next_doc = $1; # the next job ad number
                next INDEX_LINE;
            }

            # is it the marker line that precedes the url of a job ad?
            next INDEX_LINE unless $index_line =~ /\(score:.+(.+)/;

            # read next line -- they changed the format, *AGAIN*... <groan>
            my $link_line = <$index_fh>;
            next INDEX_LINE unless defined $link_line;

            # Extract the job ad's url from it.  The capture stops at
            # the closing quote, so later attributes on the same line
            # do not end up in the url.
            next INDEX_LINE unless $link_line =~ /href="([^"]*)"/;
            my $job_ad_url = "http://seeker.dice.com" . $1;

            # fetch that job ad and pull out the fields we need
            my $ad = scan_job_ad($job_ad_url, \@bad_words);
            $num_ads++;         # count the job ad

            # was the ad rejected because of a bad word?
            if (defined $ad->{reject_word}) {
                print STDERR "REJECT: $ad->{reject_word} ==> $ad->{reject_line}\n";
                $num_rejects++; # count the reject
                next INDEX_LINE;
            }

            my $job_title     = $ad->{title};
            my $position      = $ad->{position};
            my $email_address = $ad->{email};

            # did we find everything we were looking for?
            next INDEX_LINE
                if $job_title eq "" || $position eq "" || $email_address eq "";

            # The address comes from a web page, so it is untrusted.
            # Only a single plain address is accepted; anything else
            # (several addresses, leading "-", odd characters) is skipped.
            $email_address =~ s/^\s+//;
            $email_address =~ s/\s+$//;
            unless (is_plain_address($email_address)) {
                warn "skipping suspicious email address '$email_address' in $job_ad_url";
                next INDEX_LINE;
            }

            # Yes, we found them all.  Now that we have the email address,
            # check the negative file to see if we should skip this one.
            my $exp_date = $no_spam{$email_address};
            if ($exp_date && exclusion_active($exp_date, $epoch_today)) {
                next INDEX_LINE;        # not expired, do not email this guy
            }

            # Check the history database to see if this job ad has already
            # been answered within the waiting period.

            # form the key from the email address and the position id
            my $hist_key = $email_address . " " . $position;

            # have we applied for this job before, and too recently to reapply?
            my $hist_timestamp = $hist{$hist_key};
            if ($hist_timestamp
                && $hist_timestamp + $hist_waiting_period > $epoch_today) {
                next INDEX_LINE;        # do not email this guy again yet
            }

            printf STDERR ("FOUND: %40s %s\n", $email_address, $job_title);

            $num_ok++;          # count the good ad

            unless ($DEBUG) {

                # Log line on stdout, kept in the historical shape for
                # anything that reads the cron output.  It is a log only:
                # the mail is handed to sendmail directly by send_resume(),
                # never through a shell.
                my $log_line = "{ echo 'Subject: ID: " . $position . " -- " . $job_title . "' ;"
                    . " echo 'Reply-To: rj\@elilabs.com' ;"
                    . " echo 'X-Url: " . $job_ad_url . "' ;"
                    . " cat " . $email_letter . " ; }"
                    . " | /usr/lib/sendmail " . $email_address;
                printf("%s\n", $log_line);

                # send the email
                send_resume($email_address,
                            "ID: " . $position . " -- " . $job_title,
                            $job_ad_url,
                            $email_letter)
                    or warn "sending to $email_address may have failed";

                # Update the history database so we don't send duplicates
                # for the same job.  This is recorded even when sendmail
                # reported a problem, as before, so that a flaky mailer
                # cannot cause the same shop to be mailed over and over.
                $hist{$hist_key} = $epoch_today;
            }
        }                       # for each line in index page

        close($index_fh);

    } else {
        warn "cannot fetch $page_kind index page: $!";
    }

    # Paging is disabled (see BUGS above): the loop body runs exactly
    # once.  The intended condition was ($next_doc != "0").
} while (0);                    # loop thru index pages

dbmclose(%hist);

print STDERR "\n\n---------------- PROCESSED: $num_ads   REJECTED: $num_rejects   ACCEPTED: $num_ok ----------------\n\n";

# ----------------------------------------------------------------
# helper subroutines
# ----------------------------------------------------------------

# load_no_spam($fh) -- read the negative file from the open handle $fh.
# Returns a list of (email => expiration date) pairs.  Blank lines and
# lines whose first non-blank character is "#" are ignored; anything
# after the second field on a line is treated as a free-form comment.
sub load_no_spam {
    my ($fh) = @_;
    my %date_for;

    while (my $line = <$fh>) {
        chomp $line;
        next if $line =~ /^\s*#/;       # skip comment lines
        next unless $line =~ /\S/;      # skip blank lines

        my ($email, $date) = split(' ', $line);
        $date_for{$email} = $date;
    }

    return %date_for;
}

# load_bad_words($fh) -- read the bad words file from the open handle $fh.
# Returns a list of [ original text, compiled pattern ] pairs.  Patterns
# are compiled here so that a broken one is reported at startup and
# skipped, instead of killing the run in the middle of a crawl.
sub load_bad_words {
    my ($fh) = @_;
    my @words;

    while (my $line = <$fh>) {
        chomp $line;
        next unless $line =~ /\S/;      # skip blank lines
        next if $line =~ /^#/;          # skip comments

        my $pattern = eval { qr/$line/i };      # keyword match ignoring case
        unless (defined $pattern) {
            warn "ignoring invalid bad word pattern '$line': $@";
            next;
        }
        push @words, [ $line, $pattern ];
    }

    return @words;
}

# scan_job_ad($url, \@bad_words) -- fetch the job ad at $url as formatted
# text (lynx -dump) and extract the fields needed to answer it.
# Returns a hash ref with keys title, position and email (each "" when
# not found).  If a line matches a bad word, scanning stops at once and
# the keys reject_word and reject_line are set as well.
sub scan_job_ad {
    my ($url, $words) = @_;
    my %ad = (title => "", position => "", email => "");

    # List-form pipe open: the url comes from a web page and is never
    # seen by a shell.
    my $fh;
    unless (open($fh, '-|', 'lynx', '-dump', $url)) {
        warn "cannot fetch job ad $url: $!";
        return \%ad;
    }

    # for each line in the job ad...
    while (my $line = <$fh>) {
        chomp $line;

        # Here we gather the data we need to send the email
        $ad{title}    = $1 if $line =~ /Title: +(.+)/;          # job title
        $ad{position} = $1 if $line =~ /Position ID: +(.+)/;    # job id
        $ad{email}    = $1 if $line =~ /E-mail:.*\](.+)/;       # email address

        # Here we check for "bad words".  These are words that, if we
        # find them in a job ad, then we DO NOT want to respond to
        # that ad.
        foreach my $entry (@$words) {
            my ($text, $pattern) = @$entry;
            next unless $line =~ $pattern;

            $ad{reject_word} = $text;
            $ad{reject_line} = $line;
            close($fh);         # lynx may still be writing; status is irrelevant
            return \%ad;
        }
    }

    close($fh);
    return \%ad;
}

# is_plain_address($address) -- true if $address looks like one ordinary
# mailbox: no whitespace, no shell or header metacharacters, and no
# leading "-" that sendmail could take for an option.
sub is_plain_address {
    my ($address) = @_;
    return $address =~ /\A[A-Za-z0-9_][A-Za-z0-9_.+\-]*\@[A-Za-z0-9][A-Za-z0-9.\-]*\z/ ? 1 : 0;
}

# exclusion_active($exp_date, $now) -- true if the no_spam expiration
# date $exp_date (mm/dd/yy or mm/dd/yyyy) is still in the future
# relative to the epoch time $now.  A date that cannot be parsed is
# reported and treated as still active, so a typo in the negative file
# errs on the side of not mailing.
sub exclusion_active {
    my ($exp_date, $now) = @_;

    my ($mm, $dd, $yy) = split(/\//, $exp_date);
    unless (defined $yy
            && $mm =~ /\A\d+\z/ && $dd =~ /\A\d+\z/ && $yy =~ /\A\d+\z/) {
        warn "unparseable expiration date '$exp_date' in no_spam file";
        return 1;
    }

    $yy += 2000 if $yy < 100;   # two-digit years are 20yy

    # month is 0-origin <groan>; timelocal dies on out-of-range values
    my $epoch_exp = eval { timelocal(0, 0, 0, $dd, $mm - 1, $yy) };
    unless (defined $epoch_exp) {
        warn "invalid expiration date '$exp_date' in no_spam file";
        return 1;
    }

    return $now < $epoch_exp ? 1 : 0;
}

# clean_header($value) -- collapse control characters (including CR/LF)
# to spaces so that scraped text cannot inject extra mail headers.
sub clean_header {
    my ($value) = @_;
    $value =~ s/[\x00-\x1f\x7f]+/ /g;
    return $value;
}

# send_resume($to, $subject, $url, $letter_path) -- pipe one message to
# /usr/lib/sendmail for the recipient $to: the Subject, Reply-To and
# X-Url headers followed by the contents of $letter_path exactly as
# they are.  No blank line is inserted after the headers, so the letter
# file decides where the header section ends, as it always has.
# Returns 1 on success, 0 if the letter could not be read or sendmail
# could not be started or exited with an error.
sub send_resume {
    my ($to, $subject, $url, $letter_path) = @_;

    my $letter_fh;
    unless (open($letter_fh, '<', $letter_path)) {
        warn "cannot open $letter_path: $!";
        return 0;
    }

    # a sendmail that exits early must not kill us with SIGPIPE
    local $SIG{PIPE} = 'IGNORE';

    # List-form pipe open: the recipient is passed as a single argument
    # and is never seen by a shell.
    my $mail_fh;
    unless (open($mail_fh, '|-', '/usr/lib/sendmail', $to)) {
        warn "cannot run /usr/lib/sendmail: $!";
        close($letter_fh);
        return 0;
    }

    print {$mail_fh} "Subject: ", clean_header($subject), "\n";
    print {$mail_fh} "Reply-To: rj\@elilabs.com\n";
    print {$mail_fh} "X-Url: ", clean_header($url), "\n";

    while (my $line = <$letter_fh>) {
        print {$mail_fh} $line;
    }
    close($letter_fh);

    # closing a pipe waits for the child and reports its exit status
    unless (close($mail_fh)) {
        warn "sendmail failed for $to (status $?)";
        return 0;
    }

    return 1;
}