1 package Classifier::MailParse; 2 3 # --------------------------------------------------------------------------------------------- 4 # 5 # MailParse.pm --- Parse a mail message or messages into words 6 # 7 # Copyright (c) 2001-2003 John Graham-Cumming 8 # 9 # --------------------------------------------------------------------------------------------- 10 11 use strict; 12 use locale; 13 use Classifier::WordMangle; 14 15 use MIME::Base64; 16 use MIME::QuotedPrint; 17 18 # HTML entity mapping to character codes, this maps things like & to their corresponding 19 # character code 20 21 my %entityhash = ('aacute' => 224, 'Aacute' => 202, 'Acirc' => 203, 'acirc' => 225, # PROFILE BLOCK START 22 'acute' => 189, 'AElig' => 207, 'aelig' => 229, 'Agrave' => 201, 23 'agrave' => 223, 'amp' => 38, 'Aring' => 206, 'aring' => 228, 24 'atilde' => 226, 'Atilde' => 204, 'Auml' => 196, 'auml' => 228, 25 'brvbar' => 166, 'ccedil' => 230, 'Ccedil' => 208, 'cedil' => 193, 26 'cent' => 162, 'copy' => 169, 'curren' => 164, 'deg' => 185, 27 'divide' => 246, 'Eacute' => 210, 'eacute' => 232, 'ecirc' => 233, 28 'Ecirc' => 211, 'Egrave' => 209, 'egrave' => 231, 'ETH' => 217, 29 'eth' => 239, 'Euml' => 212, 'euml' => 234, 'frac12' => 198, 30 'frac14' => 197, 'frac34' => 199, 'iacute' => 236, 'Iacute' => 214, 31 'icirc' => 237, 'Icirc' => 215, 'iexcl' => 161, 'igrave' => 235, 32 'Igrave' => 213, 'iquest' => 200, 'iuml' => 238, 'Iuml' => 216, 33 'laquo' => 180, 'macr' => 184, 'micro' => 190, 'middot' => 192, 34 'nbsp' => 160, 'not' => 181, 'ntilde' => 240, 'Ntilde' => 218, 35 'oacute' => 242, 'Oacute' => 210, 'Ocirc' => 211, 'ocirc' => 243, 36 'Ograve' => 219, 'ograve' => 241, 'ordf' => 170, 'ordm' => 195, 37 'oslash' => 247, 'Oslash' => 215, 'Otilde' => 212, 'otilde' => 244, 38 'Ouml' => 214, 'ouml' => 246, 'para' => 191, 'plusmn' => 186, 39 'pound' => 163, 'raquo' => 196, 'reg' => 183, 'sect' => 167, 40 'shy' => 182, 'sup1' => 194, 'sup2' => 187, 'sup3' => 188, 41 'szlig' => 223, 'thorn' => 253, 'THORN' => 221, 'times' => 214, 42 'Uacute' => 217, 'uacute' => 249, 'ucirc' => 250, 'Ucirc' => 218, 43 'ugrave' => 248, 'Ugrave' => 216, 'uml' => 168, 'Uuml' => 220, 44 'uuml' => 252, 'Yacute' => 220, 'yacute' => 252, 'yen' => 165, 45 'yuml' => 254); # PROFILE BLOCK STOP 46 47 #---------------------------------------------------------------------------- 48 # new 49 # 50 # Class new() function 51 #---------------------------------------------------------------------------- 52 sub new 53 { 54 my $type = shift; 55 my $self; 56 57 # Used to mangle words into the right shape for classification 58 59 $self->{mangle__} = new Classifier::WordMangle; 60 61 # Hash of word frequences 62 63 $self->{words__} = {}; 64 65 # Total word cout 66 67 $self->{msg_total__} = 0; 68 69 # Internal use for keeping track of a line without touching it 70 71 $self->{ut__} = ''; 72 73 # Specifies the parse mode, 1 means color the output 74 75 $self->{color__} = 0; 76 77 # This will store the from, to, cc and subject from the last parse 78 $self->{from__} = ''; 79 $self->{to__} = ''; 80 $self->{cc__} = ''; 81 $self->{subject__} = ''; 82 83 # This is used to store the words found in the from, to, and subject 84 # lines for use in creating new magnets, it is a list of pairs mapping 85 # a magnet type to a magnet string, e.g. from => popfile@jgc.org 86 87 $self->{quickmagnets__} = {}; 88 89 # These store the current HTML background color2 and font color to 90 # detect "invisible ink" used by spammers 91 92 $self->{htmlbackcolor__} = map_color( $self, 'white' ); 93 $self->{htmlbodycolor__} = map_color( $self, 'white' ); 94 $self->{htmlfontcolor__} = map_color( $self, 'black' ); 95 96 # This is the distance betwee the back color and the font color 97 # as computed using compute_rgb_distance 98 99 $self->{htmlcolordistance__} = 0; 100 101 # This is a mapping between HTML color names and HTML hexadecimal color values used by the 102 # map_color value to get canonical color values 103 104 $self->{color_map__} = { 'aliceblue','f0f8ff', 'antiquewhite','faebd7', 'aqua','00ffff', 'aquamarine','7fffd4', 'azure','f0ffff', # PROFILE BLOCK START 105 'beige','f5f5dc', 'bisque','ffe4c4', 'black','000000', 'blanchedalmond','ffebcd', 'blue','0000ff', 'blueviolet','8a2be2', 106 'brown','a52a2a', 'burlywood','deb887', 'cadetblue','5f9ea0', 'chartreuse','7fff00', 'chocolate','d2691e', 'coral','ff7f50', 107 'cornflowerblue','6495ed', 'cornsilk','fff8dc', 'crimson','dc143c', 'cyan','00ffff', 'darkblue','00008b', 'darkcyan','008b8b', 108 'darkgoldenrod','b8860b', 'darkgray','a9a9a9', 'darkgreen','006400', 'darkkhaki','bdb76b', 'darkmagenta','8b008b', 'darkolivegreen','556b2f', 109 'darkorange','ff8c00', 'darkorchid','9932cc', 'darkred','8b0000', 'darksalmon','e9967a', 'darkseagreen','8fbc8f', 'darkslateblue','483d8b', 110 'darkturquoise','00ced1', 'darkviolet','9400d3', 'deeppink','ff1493', 'deepskyblue','00bfff', 'deepskyblue','2f4f4f', 'dimgray','696969', 111 'dodgerblue','1e90ff', 'firebrick','b22222', 'floralwhite','fffaf0', 'forestgreen','228b22', 'fuchsia','ff00ff', 'gainsboro','dcdcdc', 112 'ghostwhite','f8f8ff', 'gold','ffd700', 'goldenrod','daa520', 'gray','808080', 'green','008000', 'greenyellow','adff2f', 113 'honeydew','f0fff0', 'hotpink','ff69b4', 'indianred','cd5c5c', 'indigo','4b0082', 'ivory','fffff0', 'khaki','f0e68c', 114 'lavender','e6e6fa', 'lavenderblush','fff0f5', 'lawngreen','7cfc00', 'lemonchiffon','fffacd', 'lightblue','add8e6', 115 'lightcoral','f08080', 'lightcyan','e0ffff', 'lightgoldenrodyellow','fafad2', 'lightgreen','90ee90', 'lightgrey','d3d3d3', 116 'lightpink','ffb6c1', 'lightsalmon','ffa07a', 'lightseagreen','20b2aa', 'lightskyblue','87cefa', 'lightslategray','778899', 117 'lightsteelblue','b0c4de', 'lightyellow','ffffe0', 'lime','00ff00', 'limegreen','32cd32', 'linen','faf0e6', 'magenta','ff00ff', 118 'maroon','800000', 'mediumaquamarine','66cdaa', 'mediumblue','0000cd', 'mediumorchid','ba55d3', 'mediumpurple','9370db', 119 'mediumseagreen','3cb371', 'mediumslateblue','7b68ee', 'mediumspringgreen','00fa9a', 'mediumturquoise','48d1cc', 120 'mediumvioletred','c71585', 'midnightblue','191970', 'mintcream','f5fffa', 'mistyrose','ffe4e1', 'moccasin','ffe4b5', 121 'navajowhite','ffdead', 'navy','000080', 'oldlace','fdf5e6', 'olive','808000', 'olivedrab','6b8e23', 'orange','ffa500', 122 'orangered','ff4500', 'orchid','da70d6', 'palegoldenrod','eee8aa', 'palegreen','98fb98', 'paleturquoise','afeeee', 123 'palevioletred','db7093', 'papayawhip','ffefd5', 'peachpuff','ffdab9', 'peru','cd853f', 'pink','ffc0cb', 'plum','dda0dd', 124 'powderblue','b0e0e6', 'purple','800080', 'red','ff0000', 'rosybrown','bc8f8f', 'royalblue','4169e1', 'saddlebrown','8b4513', 125 'salmon','fa8072', 'sandybrown','f4a460', 'seagreen','2e8b57', 'seashell','fff5ee', 'sienna','a0522d', 'silver','c0c0c0', 126 'skyblue','87ceeb', 'slateblue','6a5acd', 'slategray','708090', 'snow','fffafa', 'springgreen','00ff7f', 'steelblue','4682b4', 127 'tan','d2b48c', 'teal','008080', 'thistle','d8bfd8', 'tomato','ff6347', 'turquoise','40e0d0', 'violet','ee82ee', 'wheat','f5deb3', 128 'white','ffffff', 'whitesmoke','f5f5f5', 'yellow','ffff00', 'yellowgreen','9acd32' }; # PROFILE BLOCK STOP 129 130 $self->{content_type__} = ''; 131 $self->{base64__} = ''; 132 $self->{in_html_tag__} = 0; 133 $self->{html_tag__} = ''; 134 $self->{html_arg__} = ''; 135 $self->{in_headers__} = 0; 136 $self->{first20__} = ''; 137 138 return bless $self, $type; 139 } 140 141 # --------------------------------------------------------------------------------------------- 142 # 143 # compute_rgb_distance 144 # 145 # Given two RGB colors compute the distance between them by considering them as points 146 # in 3 dimensions and calculating the distance between them (or equivalently the length 147 # of a vector between them) 148 # 149 # $left One color 150 # $right The other color 151 # 152 # --------------------------------------------------------------------------------------------- 153 sub compute_rgb_distance 154 { 155 my ( $self, $left, $right ) = @_; 156 157 # Figure out where the left color is and then subtract the right 158 # color (point from it) to get the vector 159 160 $left =~ /^(..)(..)(..)$/; 161 my ( $rl, $gl, $bl ) = ( hex($1), hex($2), hex($3) ); 162 163 $right =~ /^(..)(..)(..)$/; 164 my ( $r, $g, $b ) = ( $rl - hex($1), $gl - hex($2), $bl - hex($3) ); 165 166 # Now apply Pythagoras in 3D to get the distance between them, we return 167 # the int because we don't need decimal level accuracy 168 169 return int( sqrt( $r*$r + $g*$g + $b*$b ) ); 170 } 171 172 # --------------------------------------------------------------------------------------------- 173 # 174 # compute_html_color_distance 175 # 176 # Calls compute_rgb_distance to set up htmlcolordistance__ from the current HTML back and 177 # font colors 178 # 179 # --------------------------------------------------------------------------------------------- 180 sub compute_html_color_distance 181 { 182 my ( $self ) = @_; 183 184 $self->{htmlcolordistance__} = $self->compute_rgb_distance( $self->{htmlfontcolor__}, $self->{htmlbackcolor__} ); 185 } 186 187 # --------------------------------------------------------------------------------------------- 188 # 189 # map_color 190 # 191 # Convert an HTML color value into its canonical lower case hexadecimal form with no # 192 # 193 # $color A color value found in a tag 194 # 195 # --------------------------------------------------------------------------------------------- 196 sub map_color 197 { 198 my ( $self, $color ) = @_; 199 200 # The canonical form is lowercase hexadecimal, so start by lowercasing and stripping any 201 # initial # 202 203 $color = lc( $color ); 204 $color =~ s/^#//; 205 206 # Map color names to hexadecimal values 207 208 if ( defined( $self->{color_map__}{$color} ) ) { 209 return $self->{color_map__}{$color}; 210 } else { 211 return $color; 212 } 213 } 214 215 # --------------------------------------------------------------------------------------------- 216 # 217 # increment_word 218 # 219 # Updates the word frequency for a word without performing any coloring or transformation 220 # on the word 221 # 222 # $word The word 223 # 224 # --------------------------------------------------------------------------------------------- 225 sub increment_word 226 { 227 my ($self, $word) = @_; 228 229 $self->{words__}{$word} += 1; 230 $self->{msg_total__} += 1; 231 232 print "--- $word ($self->{words__}{$word})\n" if ($self->{debug__}); 233 } 234 235 # --------------------------------------------------------------------------------------------- 236 # 237 # update_pseudoword 238 # 239 # Updates the word frequency for a pseudoword, note that this differs from update_word 240 # because it does no word mangling 241 # 242 # $prefix The pseudoword prefix (e.g. header) 243 # $word The pseudoword (e.g. Mime-Version) 244 # $encoded Whether this was found inside encoded text 245 # $literal The literal text that generated this pseudoword 246 # 247 # --------------------------------------------------------------------------------------------- 248 sub update_pseudoword 249 { 250 my ( $self, $prefix, $word, $encoded, $literal ) = @_; 251 252 my $mword = "$prefix:$word"; 253 254 if ( $self->{color__} ) { 255 if ( $encoded == 1 ) { 256 $literal =~ s/</</g; 257 $literal =~ s/>/>/g; 258 my $color = $self->{bayes__}->get_color($mword); 259 my $to = "<b><font color=\"$color\"><a title=\"$mword\">$literal</a></font></b>"; 260 $self->{ut__} .= $to . ' '; 261 } 262 } 263 264 $self->increment_word( $mword ); 265 } 266 267 # --------------------------------------------------------------------------------------------- 268 # 269 # update_word 270 # 271 # Updates the word frequency for a word 272 # 273 # $word The word that is being updated 274 # $encoded 1 if the line was found in encoded text (base64) 275 # $before The character that appeared before the word in the original line 276 # $after The character that appeared after the word in the original line 277 # $prefix A string to prefix any words with in the corpus, used for the special 278 # identification of values found in for example the subject line 279 # 280 # --------------------------------------------------------------------------------------------- 281 sub update_word 282 { 283 my ($self, $word, $encoded, $before, $after, $prefix) = @_; 284 285 my $mword = $self->{mangle__}->mangle($word); 286 287 if ( $mword ne '' ) { 288 $mword = $prefix . ':' . $mword if ( $prefix ne '' ); 289 290 if ( $prefix =~ /(from|to|cc|subject)/i ) { 291 push @{$self->{quickmagnets__}{$prefix}}, $word; 292 } 293 294 if ( $self->{color__} ) { 295 my $color = $self->{bayes__}->get_color($mword); 296 if ( $encoded == 0 ) { 297 $after = '&' if ( $after eq '>' ); 298 if ( !( $self->{ut__} =~ s/($before)\Q$word\E($after)/$1<b><font color=\"$color\">$word<\/font><\/b>$2/ ) ) { 299 print "Could not find $word for colorization\n" if ( $self->{debug__} ); 300 } 301 } else { 302 $self->{ut__} .= "<font color=\"$color\">$word<\/font> "; 303 } 304 } 305 306 $self->increment_word( $mword ); 307 } 308 } 309 310 # --------------------------------------------------------------------------------------------- 311 # 312 # add_line 313 # 314 # Parses a single line of text and updates the word frequencies 315 # 316 # $bigline The line to split into words and add to the word counts 317 # $encoded 1 if the line was found in encoded text (base64) 318 # $prefix A string to prefix any words with in the corpus, used for the special 319 # identification of values found in for example the subject line 320 # 321 # --------------------------------------------------------------------------------------------- 322 sub add_line 323 { 324 my ($self, $bigline, $encoded, $prefix) = @_; 325 my $p = 0; 326 327 print "add_line: [$bigline]\n" if $self->{debug__}; 328 329 # If the line is really long then split at every 1k and feed it to the parser below 330 331 # Check the HTML back and font colors to ensure that we are not about to 332 # add words that are hidden inside invisible ink 333 334 if ( $self->{htmlfontcolor__} ne $self->{htmlbackcolor__} ) { 335 336 # If we are adding a line and the colors are different then we will 337 # add a count for the color difference to make sure that we catch 338 # camouflage attacks using similar colors, if the color similarity 339 # is less than 100. I chose 100 somewhat arbitrarily but classic 340 # black text on white background has a distance of 441, red/blue or 341 # green on white has distance 255. 100 seems like a reasonable upper 342 # bound for tracking evil spammer tricks with similar colors 343 344 $self->compute_html_color_distance(); 345 if ( $self->{htmlcolordistance__} < 100 ) { 346 $self->update_pseudoword( 'html', "colordistance$self->{htmlcolordistance__}", $encoded, '' ); 347 } 348 349 while ( $p < length($bigline) ) { 350 my $line = substr($bigline, $p, 1024); 351 352 # mangle up html character entities 353 # these are just the low ISO-Latin1 entities 354 # see: http://www.w3.org/TR/REC-html32#latin1 355 # TODO: find a way to make this (and other similar stuff) highlight 356 # without using the encoded content printer or modifying $self->{ut__} 357 358 while ( $line =~ m/(&(\w{3,6});)/g ) { 359 my $from = $1; 360 my $to = $entityhash{$2}; 361 362 if ( defined( $to ) ) { 363 $to = chr($to); 364 $line =~ s/$from/$to/g; 365 $self->{ut__} =~ s/$from/$to/g; 366 print "$from -> $to\n" if $self->{debug__}; 367 } 368 } 369 370 while ( $line =~ m/(&#([\d]{1,3});)/g ) { 371 372 # Don't decode odd (nonprintable) characters or < >'s. 373 374 if ( ( ( $2 < 255 ) && ( $2 > 63 ) ) || ( $2 == 61 ) || ( ( $2 < 60 ) && ( $2 > 31 ) ) ) { 375 my $from = $1; 376 my $to = chr($2); 377 378 if ( defined( $to ) && ( $to ne '' ) ) { 379 $line =~ s/$from/$to/g; 380 $self->{ut__} =~ s/$from/$to/g; 381 print "$from -> $to\n" if $self->{debug__}; 382 $self->update_pseudoword( 'html', 'numericentity', $encoded, $from ); 383 } 384 } 385 } 386 387 # Pull out any email addresses in the line that are marked with <> and have an @ in them 388 389 while ( $line =~ s/(mailto:)?([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+\.[[:alpha:]0-9\-_]+))([\&\)\?\:\/ >\&\;])// ) { 390 update_word($self, $2, $encoded, ($1?$1:''), '[\&\?\:\/ >\&\;]', $prefix); 391 add_url($self, $3, $encoded, '\@', '[\&\?\:\/]', $prefix); 392 } 393 394 # Grab domain names 395 while ( $line =~ s/(([[:alpha:]0-9\-_]+\.)+)(com|edu|gov|int|mil|net|org|aero|biz|coop|info|museum|name|pro)([^[:alpha:]0-9\-_\.]|$)/$4/i ) { 396 add_url($self, "$1$3", $encoded, '', '', $prefix); 397 } 398 399 # Grab IP addresses 400 401 while ( $line =~ s/(([12]?\d{1,2}\.){3}[12]?\d{1,2})// ) { 402 update_word($self, "$1", $encoded, '', '', $prefix); 403 } 404 405 # Deal with runs of alternating spaces and letters 406 407 foreach my $space (' ', '\'', '*', '^', '`', ' ', '\38', '.' ){ 408 while ( $line =~ s/( |^)(([A-Z]\Q$space\E){2,15}[A-Z])( |\Q$space\E|[!\?,])/ /i ) { 409 my $original = "$1$2$4"; 410 my $word = $2; 411 print "$word ->" if $self->{debug__}; 412 $word =~ s/[^A-Z]//gi; 413 print "$word\n" if $self->{debug__}; 414 $self->update_word( $word, $encoded, ' ', ' ', $prefix); 415 $self->update_pseudoword( 'trick', 'spacedout', $encoded, $original ); 416 } 417 } 418 419 # Deal with random insertion of . inside words 420 421 while ( $line =~ s/ ([A-Z]+)\.([A-Z]{2,}) / $1$2 /i ) { 422 $self->update_pseudoword( 'trick', 'dottedwords', $encoded, "$1$2" ); 423 } 424 425 # Only care about words between 3 and 45 characters since short words like 426 # an, or, if are too common and the longest word in English (according to 427 # the OED) is pneumonoultramicroscopicsilicovolcanoconiosis 428 429 while ( $line =~ s/([[:alpha:]][[:alpha:]\']{1,44})([_\-,\.\"\'\)\?!:;\/& \t\n\r]{0,5}|$)// ) { 430 if ( ( $self->{in_headers__} == 0 ) && ( $self->{first20count__} < 20 ) ) { 431 $self->{first20count__} += 1; 432 $self->{first20__} .= " $1"; 433 } 434 435 update_word($self,$1, $encoded, '', '[_\-,\.\"\'\)\?!:;\/ &\t\n\r]', $prefix) if (length $1 >= 3); 436 } 437 438 $p += 1024; 439 } 440 } else { 441 if ( $bigline =~ /[^ \t]/ ) { 442 $self->update_pseudoword( 'trick', 'invisibleink', $encoded, $bigline ); 443 } 444 } 445 } 446 447 # --------------------------------------------------------------------------------------------- 448 # 449 # update_tag 450 # 451 # Extract elements from within HTML tags that are considered important 'words' for analysis 452 # such as domain names, alt tags, 453 # 454 # $tag The tag name 455 # $arg The arguments 456 # $end_tag Whether this is an end tag or not 457 # $encoded 1 if this HTML was found inside encoded (base64) text 458 # 459 # --------------------------------------------------------------------------------------------- 460 sub update_tag 461 { 462 my ( $self, $tag, $arg, $end_tag, $encoded ) = @_; 463 464 $tag =~ s/[\r\n]//g; 465 $arg =~ s/[\r\n]//g; 466 467 print "HTML tag $tag with argument " . $arg . "\n" if ($self->{debug__}); 468 469 # End tags do not require any argument decoding but we do look at them 470 # to make sure that we handle /font to change the font color 471 472 if ( $end_tag ) { 473 if ( $tag =~ /^font$/i ) { 474 $self->{htmlfontcolor__} = map_color( $self, 'black' ); 475 $self->compute_html_color_distance(); 476 } 477 478 # If we hit a table tag then any font information is lost 479 480 if ( $tag =~ /^(table|td|tr|th)$/i ) { 481 $self->{htmlfontcolor__} = map_color( $self, 'black' ); 482 $self->{htmlbackcolor__} = $self->{htmlbodycolor__}; 483 $self->compute_html_color_distance(); 484 } 485 486 return; 487 } 488 489 # Count the number of TD elements 490 $self->update_pseudoword('html', 'td', $encoded, $tag ) if ( $tag =~ /^td$/i ); 491 492 my $attribute; 493 my $value; 494 495 # These are used to pass good values to update_word 496 497 my $quote; 498 my $end_quote; 499 500 # Strip the first attribute while there are any attributes 501 # Match the closing attribute character, if there is none 502 # (this allows nested single/double quotes), 503 # match a space or > or EOL 504 505 my $original; 506 507 while ( $arg =~ s/[ \t]*((\w+)[ \t]*=[ \t]*([\"\'])?(.*?)(\3|($|([ \t>]))))//i ) { 508 $original = $1; 509 $attribute = $2; 510 $value = $4; 511 $quote = ''; 512 $end_quote = '[\> \t\&\n]'; 513 if (defined $3) { 514 $quote = $3; 515 $end_quote = $3; 516 } 517 518 print " attribute $attribute with value $quote$value$quote\n" if ($self->{debug__}); 519 520 # Remove leading whitespace and leading value-less attributes 521 522 if ( $arg =~ s/^(([ \t]*(\w+)[\t ]+)+)([^=])/$4/ ) { 523 print " attribute(s) " . $1 . " with no value\n" if ($self->{debug__}); 524 } 525 526 # Toggle for parsing script URI's. 527 # Should be left off (0) until more is known about how different html 528 # rendering clients behave. 529 530 my $parse_script_uri = 0; 531 532 # Tags with src attributes 533 534 if ( ( $attribute =~ /^src$/i ) && # PROFILE BLOCK START 535 ( ( $tag =~ /^img|frame|iframe$/i ) 536 || ( $tag =~ /^script$/i && $parse_script_uri ) ) ) { # PROFILE BLOCK STOP 537 538 # "CID:" links refer to an origin-controlled attachment to a html email. 539 # Adding strings from these, even if they appear to be hostnames, may or 540 # may not be beneficial 541 542 if ($value =~ /^cid\:/i ) 543 { 544 # TODO: Decide what to do here, ignoring CID's for now 545 } else { 546 547 my $host = add_url( $self, $value, $encoded, $quote, $end_quote, '', 1 ); 548 549 # If the host name is not blank (i.e. there was a hostname in the url 550 # and it was an image, then if the host was not this host then report 551 # an off machine image 552 553 if ( ( $host ne '' ) && ( $tag =~ /^img$/i ) ) { 554 if ( $host ne 'localhost' ) { 555 $self->update_pseudoword( 'html', 'imgremotesrc', $encoded, $original ); 556 } 557 } 558 559 next; 560 } 561 562 add_url( $self, $value, $encoded, $quote, $end_quote, '' ); 563 next; 564 } 565 566 # Tags with href attributes 567 568 if ( $attribute =~ /^href$/i && $tag =~ /^(a|link|base|area)$/i ) { 569 570 # Look for mailto:'s 571 572 if ($value =~ /^mailto:/i) { 573 if ( $tag =~ /^a$/ && $value =~ /^mailto:([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+?))([>\&\?\:\/]|$)/i ) { 574 update_word( $self, $1, $encoded, 'mailto:', ($3?'[\\\>\&\?\:\/]':$end_quote), '' ); 575 add_url( $self, $2, $encoded, '@', ($3?'[\\\&\?\:\/]':$end_quote), '' ); 576 } 577 } else { 578 579 # Anything that isn't a mailto is probably an URL 580 581 $self->add_url($value, $encoded, $quote, $end_quote, ''); 582 } 583 584 next; 585 } 586 587 # Tags with alt attributes 588 589 if ( $attribute =~ /^alt$/i && $tag =~ /^img$/i ) { 590 add_line($self, $value, $encoded, ''); 591 next; 592 } 593 594 # Tags with working background attributes 595 596 if ( $attribute =~ /^background$/i && $tag =~ /^(td|table|body)$/i ) { 597 add_url( $self, $value, $encoded, $quote, $end_quote, '' ); 598 next; 599 } 600 601 # Tags that load sounds 602 603 if ( $attribute =~ /^bgsound$/i && $tag =~ /^body$/i ) { 604 add_url( $self, $2, $encoded, $quote, $end_quote, '' ); 605 next; 606 } 607 608 609 # Tags with colors in them 610 611 if ( ( $attribute =~ /^color$/i ) && ( $tag =~ /^font$/i ) ) { 612 update_word( $self, $value, $encoded, $quote, $end_quote, '' ); 613 $self->update_pseudoword( 'html', "fontcolor$value", $encoded, $original ); 614 $self->{htmlfontcolor__} = map_color($self, $value); 615 $self->compute_html_color_distance(); 616 print "Set html font color to $self->{htmlfontcolor__}\n" if ( $self->{debug__} ); 617 } 618 619 if ( ( $attribute =~ /^text$/i ) && ( $tag =~ /^body$/i ) ) { 620 $self->update_pseudoword( 'html', "fontcolor$value", $encoded, $original ); 621 update_word( $self, $value, $encoded, $quote, $end_quote, '' ); 622 $self->{htmlfontcolor__} = map_color($self, $value); 623 $self->compute_html_color_distance(); 624 print "Set html font color to $self->{htmlfontcolor__}\n" if ( $self->{debug__} ); 625 } 626 627 # The width and height of images 628 629 if ( ( $attribute =~ /^(width|height)$/i ) && ( $tag =~ /^img$/i ) ) { 630 $attribute = lc( $attribute ); 631 $self->update_pseudoword( 'html', "img$attribute$value", $encoded, $original ); 632 } 633 634 # Font sizes 635 636 if ( ( $attribute =~ /^size$/i ) && ( $tag =~ /^font$/i ) ) { 637 $self->update_pseudoword( 'html', "fontsize$value", $encoded, $original ); 638 } 639 640 # Tags with background colors 641 642 if ( ( $attribute =~ /^(bgcolor|back)$/i ) && ( $tag =~ /^(td|table|body|tr|th|font)$/i ) ) { 643 update_word( $self, $value, $encoded, $quote, $end_quote, '' ); 644 $self->update_pseudoword( 'html', "backcolor$value" ); 645 $self->{htmlbackcolor__} = map_color($self, $value); 646 print "Set html back color to $self->{htmlbackcolor__}\n" if ( $self->{debug__} ); 647 648 $self->{htmlbodycolor__} = $self->{htmlbackcolor__} if ( $tag =~ /^body$/i ); 649 $self->compute_html_color_distance(); 650 } 651 652 # Tags with a charset 653 654 if ( ( $attribute =~ /^content$/i ) && ( $tag =~ /^meta$/i ) ) { 655 if ( $value=~ /charset=([^ ]{1,40})[\"\>]?/ ) { 656 update_word( $self, $1, $encoded, '', '', '' ); 657 } 658 } 659 660 # Tags with style attributes (this one may impact performance!!!) 661 # most container tags accept styles, and the background style may 662 # not be in a predictable location (search the entire value) 663 664 if ( $attribute =~ /^style$/i && $tag =~ /^(body|td|tr|table|span|div|p)$/i ) { 665 add_url( $self, $1, $encoded, '[\']', '[\']', '' ) if ( $value =~ /background\-image:[ \t]?url\([ \t]?\'(.*)\'[ \t]?\)/i ); 666 next; 667 } 668 669 # Tags with action attributes 670 671 if ( $attribute =~ /^action$/i && $tag =~ /^form$/i ) { 672 if ( $value =~ /^(ftp|http|https):\/\//i ) { 673 add_url( $self, $value, $encoded, $quote, $end_quote, '' ); 674 next; 675 } 676 677 # mailto forms 678 679 if ( $value =~ /^mailto:([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+?))([>\&\?\:\/])/i ) { 680 update_word( $self, $1, $encoded, 'mailto:', ($3?'[\\\>\&\?\:\/]':$end_quote), '' ); 681 add_url( $self, $2, $encoded, '@', ($3?'[\\\>\&\?\:\/]':$end_quote), '' ); 682 } 683 next; 684 } 685 } 686 } 687 688 # --------------------------------------------------------------------------------------------- 689 # 690 # add_url 691 # 692 # Parses a single url or domain and identifies interesting parts 693 # 694 # $url the domain name to handle 695 # $encoded 1 if the domain was found in encoded text (base64) 696 # $before The character that appeared before the URL in the original line 697 # $after The character that appeared after the URL in the original line 698 # $prefix A string to prefix any words with in the corpus, used for the special 699 # identification of values found in for example the subject line 700 # $noadd If defined indicates that only parsing should be done, no word updates 701 # 702 # Returns the hostname 703 # 704 # --------------------------------------------------------------------------------------------- 705 sub add_url 706 { 707 my ($self, $url, $encoded, $before, $after, $prefix, $noadd) = @_; 708 709 my $temp_url = $url; 710 my $temp_before; 711 my $temp_after; 712 my $hostform; #ip or name 713 714 # parts of a URL, from left to right 715 my $protocol; 716 my $authinfo; 717 my $host; 718 my $port; 719 my $path; 720 my $query; 721 my $hash; 722 723 # Strip the protocol part of a URL (e.g. http://) 724 725 $protocol = $1 if ( $url =~ s/^([^:]*)\:\/\/// ); 726 727 # Remove any URL encoding (protocol may not be URL encoded) 728 729 my $oldurl = $url; 730 my $percents = ( $url =~ s/(%([0-9A-Fa-f]{2}))/chr(hex("0x$2"))/ge ); 731 732 if ( $percents > 0 ) { 733 $self->update_pseudoword( 'html', 'encodedurl', $encoded, $oldurl ) if ( !defined( $noadd ) ); 734 } 735 736 # Extract authorization information from the URL (e.g. http://foo@bar.com) 737 738 $authinfo = $1 if ( $url =~ s/^(([[:alpha:]0-9\-_\.\;\:\&\=\+\$\,]+)(\@|\%40))+// ); 739 740 $self->update_pseudoword( 'html', 'authorization', $encoded, $oldurl ) if ( defined( $authinfo ) && ( $authinfo ne '' ) ); 741 742 if ( $url =~ s/^(([[:alpha:]0-9\-_]+\.)+)(com|edu|gov|int|mil|net|org|aero|biz|coop|info|museum|name|pro|[[:alpha:]]{2})([^[:alpha:]0-9\-_\.]|$)/$4/i ) { 743 $host = "$1$3"; 744 $hostform = "name"; 745 } else { 746 if ( $url =~ /(([^:\/])+)/ ) { 747 748 # Some other hostname format found, maybe 749 # Read here for reference: http://www.pc-help.org/obscure.htm 750 # Go here for comparison: http://www.samspade.org/t/url 751 752 # save the possible hostname 753 754 my $host_candidate = $1; 755 756 # stores discovered IP address 757 758 my %quads; 759 760 # temporary values 761 762 my $quad = 1; 763 my $number; 764 765 # iterate through the possible hostname, build dotted quad format 766 767 while ($host_candidate =~ s/\G^((0x)[0-9A-Fa-f]+|0[0-7]+|[0-9]+)(\.)?//) { 768 my $hex = $2; 769 770 # possible IP quad(s) 771 772 my $quad_candidate = $1; 773 my $more_dots = $3; 774 775 if (defined $hex) { 776 777 # hex number 778 # trim arbitrary octets that are greater than most significant bit 779 780 $quad_candidate =~ s/.*(([0-9A-F][0-9A-F]){4})$/$1/i; 781 $number = hex( $quad_candidate ); 782 } else { 783 if ( $quad_candidate =~ /^0([0-7]+)/ ) { 784 785 # octal number 786 787 $number = oct($1); 788 } else { 789 790 # assume decimal number 791 # deviates from the obscure.htm document here, no current browsers overflow 792 793 $number = int($quad_candidate); 794 } 795 } 796 797 # No more IP dots? 798 799 if ( !defined( $more_dots ) ) { 800 801 # Expand final decimal/octal/hex to extra quads 802 803 while ( $quad <= 4 ) { 804 my $shift = ((4 - $quad) * 8); 805 $quads{$quad} = ($number & (hex("0xFF") << $shift) ) >> $shift; 806 $quad += 1; 807 } 808 } else { 809 810 # Just plug the quad in, no overflow allowed 811 812 $quads{$quad} = $number if ($number < 256); 813 $quad += 1; 814 } 815 816 last if ( $quad > 4 ); 817 } 818 819 $host_candidate =~ s/\r|\n|$//g; 820 if ( ( $host_candidate eq '' ) && # PROFILE BLOCK START 821 defined( $quads{1} ) && 822 defined( $quads{2} ) && 823 defined( $quads{3} ) && 824 defined( $quads{4} ) && 825 !defined( $quads{5} ) ) { # PROFILE BLOCK STOP 826 827 # we did actually find an IP address, and not some fake 828 829 $hostform = "ip"; 830 $host = "$quads{1}.$quads{2}.$quads{3}.$quads{4}"; 831 $url =~ s/(([^:\/])+)//; 832 } 833 } 834 } 835 836 if ( !defined( $host ) || ( $host eq '' ) ) { 837 print "no hostname found: [$temp_url]\n" if ($self->{debug__}); 838 return ''; 839 } 840 841 $port = $1 if ( $url =~ s/^\:(\d+)//); 842 $path = $1 if ( $url =~ s/^([\\\/][^\#\?\n]*)($)?// ); 843 $query = $1 if ( $url =~ s/^[\?]([^\#\n]*|$)?// ); 844 $hash = $1 if ( $url =~ s/^[\#](.*)$// ); 845 846 if ( !defined( $protocol ) || ( $protocol =~ /^(http|https)$/ ) ) { 847 $temp_before = $before; 848 $temp_before = "\:\/\/" if (defined $protocol); 849 $temp_before = "[\@]" if (defined $authinfo); 850 851 $temp_after = $after; 852 $temp_after = "[\#]" if (defined $hash); 853 $temp_after = "[\?]" if (defined $query); 854 $temp_after = "[\\\\\/]" if (defined $path); 855 $temp_after = "[\:]" if (defined $port); 856 857 update_word( $self, $host, $encoded, $temp_before, $temp_after, $prefix) if ( !defined( $noadd ) ); 858 859 # decided not to care about tld's beyond the verification performed when 860 # grabbing $host 861 # special subTLD's can just get their own classification weight (eg, .bc.ca) 862 # http://www.0dns.org has a good reference of ccTLD's and their sub-tld's if desired 863 864 if ( $hostform eq "name" ) { 865 while ( $host =~ s/^([^\.])+\.(.*\.(.*))$/$2/ ) { 866 update_word( $self, $2, $encoded, '[\.]', '[<]', $prefix) if ( !defined( $noadd ) ); 867 } 868 } 869 } 870 871 # $protocol $authinfo $host $port $query $hash may be processed below if desired 872 return $host; 873 } 874 875 # --------------------------------------------------------------------------------------------- 876 # 877 # parse_html 878 # 879 # Parse a line that might contain HTML information, returns 1 if we are still inside an 880 # unclosed HTML tag 881 # 882 # $line A line of text 883 # $encoded 1 if this HTML was found inside encoded (base64) text 884 # 885 # --------------------------------------------------------------------------------------------- 886 sub parse_html 887 { 888 my ( $self, $line, $encoded ) = @_; 889 890 my $found = 1; 891 892 $line =~ s/[\r\n]+//gm; 893 894 print "parse_html: [$line] " . $self->{in_html_tag__} . "\n" if $self->{debug__}; 895 896 # Remove HTML comments and other tags that begin ! 897 898 while ( $line =~ s/(<!.*?>)// ) { 899 $self->update_pseudoword( 'html', 'comment', $encoded, $1 ); 900 print "$line\n" if $self->{debug__}; 901 } 902 903 while ( $found && ( $line ne '' ) ) { 904 $found = 0; 905 906 # If we are in an HTML tag then look for the close of the tag, if we get it then 907 # handle the tag, if we don't then keep building up the arguments of the tag 908 909 if ( $self->{in_html_tag__} ) { 910 if ( $line =~ s/^(.*?)>// ) { 911 $self->{html_arg__} .= $1; 912 $self->{in_html_tag__} = 0; 913 $self->{html_tag__} =~ s/=\n ?//g; 914 $self->{html_arg__} =~ s/=\n ?//g; 915 update_tag( $self, $self->{html_tag__}, $self->{html_arg__}, $self->{html_end}, $encoded ); 916 $self->{html_tag__} = ''; 917 $self->{html_arg__} = ''; 918 $found = 1; 919 next; 920 } else { 921 $self->{html_arg__} .= $line; 922 return 1; 923 } 924 } 925 926 # Does the line start with a HTML tag that is closed (i.e. has both the < and the 927 # > present)? If so then handle that tag immediately and continue 928 929 if ( $line =~ s/^<([\/]?)([A-Za-z]+)([^>]*?)>// ) { 930 update_tag( $self, $2, $3, ( $1 eq '/' ), $encoded ); 931 $found = 1; 932 next; 933 } 934 935 # Does the line consist of just a tag that has no closing > then set up the global 936 # vars that record the tag and return 1 to indicate to the caller that we have an 937 # unclosed tag 938 939 if ( $line =~ /^<([\/]?)([^ >]+)([^>]*)$/ ) { 940 $self->{html_end} = ( $1 eq '/' ); 941 $self->{html_tag__} = $2; 942 $self->{html_arg__} = $3; 943 $self->{in_html_tag__} = 1; 944 return 1; 945 } 946 947 # There could be something on the line that needs parsing (such as a word), if we reach here 948 # then we are not in an unclosed tag and so we can grab everything from the start of the line 949 # to the end or the first < and pass it to the line parser 950 951 if ( $line =~ s/^([^<]+)(<|$)/$2/ ) { 952 $found = 1; 953 add_line( $self, $1, $encoded, '' ); 954 } 955 } 956 957 return 0; 958 } 959 960 # --------------------------------------------------------------------------------------------- 961 # 962 # parse_file 963 # 964 # Read messages from file and parse into a list of words and frequencies, returns a colorized 965 # HTML version of message if color__ is set 966 # 967 # $file The file to open and parse 968 # 969 # --------------------------------------------------------------------------------------------- 970 sub parse_file 971 { 972 my ( $self, $file ) = @_; 973 974 $self->start_parse(); 975 976 open MSG, "<$file"; 977 binmode MSG; 978 979 # Read each line and find each "word" which we define as a sequence of alpha 980 # characters 981 982 while (<MSG>) { 983 $self->parse_line( $_ ); 984 } 985 986 close MSG; 987 988 $self->stop_parse(); 989 990 if ( $self->{color__} ) { 991 $self->{colorized__} .= $self->{ut__} if ( $self->{ut__} ne '' ); 992 993 $self->{colorized__} .= "</tt>"; 994 $self->{colorized__} =~ s/(\r\n\r\n|\r\r|\n\n)/__BREAK____BREAK__/g; 995 $self->{colorized__} =~ s/[\r\n]+/__BREAK__/g; 996 $self->{colorized__} =~ s/__BREAK__/<br \/>/g; 997 998 return $self->{colorized__}; 999 } else { 1000 return ''; 1001 } 1002 } 1003 1004 # --------------------------------------------------------------------------------------------- 1005 # 1006 # start_parse 1007 # 1008 # Called to reset internal variables before parsing. This is automatically called when using 1009 # the parse_file API, and must be called before the first call to parse_line. 1010 # 1011 # --------------------------------------------------------------------------------------------- 1012 sub start_parse 1013 { 1014 my ( $self ) = @_; 1015 1016 # This will contain the mime boundary information in a mime message 1017 1018 $self->{mime__} = ''; 1019 1020 # Contains the encoding for the current block in a mime message 1021 1022 $self->{encoding__} = ''; 1023 1024 # Variables to save header information to while parsing headers 1025 1026 $self->{header__} = ''; 1027 $self->{argument__} = ''; 1028 1029 # Clear the word hash 1030 1031 $self->{content_type__} = ''; 1032 1033 # Base64 attachments are loaded into this as we read them 1034 1035 $self->{base64__} = ''; 1036 1037 # Variable to note that the temporary colorized storage is "frozen", 1038 # and what type of freeze it is (allows nesting of reasons to freeze 1039 # colorization) 1040 1041 $self->{in_html_tag__} = 0; 1042 1043 $self->{html_tag__} = ''; 1044 $self->{html_arg__} = ''; 1045 1046 $self->{words__} = {}; 1047 $self->{msg_total__} = 0; 1048 $self->{from__} = ''; 1049 $self->{to__} = ''; 1050 $self->{cc__} = ''; 1051 $self->{subject__} = ''; 1052 $self->{ut__} = ''; 1053 $self->{quickmagnets__} = {}; 1054 1055 $self->{htmlbackcolor__} = map_color( $self, 'white' ); 1056 $self->{htmlfontcolor__} = map_color( $self, 'black' ); 1057 $self->compute_html_color_distance(); 1058 1059 $self->{in_headers__} = 1; 1060 1061 $self->{first20__} = ''; 1062 $self->{first20count__} = 0; 1063 1064 # Used to return a colorize page 1065 1066 $self->{colorized__} = ''; 1067 $self->{colorized__} .= "<tt>" if ( $self->{color__} ); 1068 } 1069 1070 # --------------------------------------------------------------------------------------------- 1071 # 1072 # stop_parse 1073 # 1074 # Called at the end of a parse job. Automatically called if parse_file is used, must be 1075 # called after the last call to parse_line. 1076 # 1077 # --------------------------------------------------------------------------------------------- 1078 sub stop_parse 1079 { 1080 my ( $self ) = @_; 1081 1082 $self->{colorized__} .= $self->clear_out_base64(); 1083 1084 # If we reach here and discover that we think that we are in an unclosed HTML tag then there 1085 # has probably been an error (such as a < in the text messing things up) and so we dump 1086 # whatever is stored in the HTML tag out 1087 1088 if ( $self->{in_html_tag__} ) { 1089 $self->add_line( $self->{html_tag__} . ' ' . $self->{html_arg__}, 0, '' ); 1090 } 1091 1092 $self->{in_html_tag__} = 0; 1093 } 1094 1095 # --------------------------------------------------------------------------------------------- 1096 # 1097 # parse_line 1098 # 1099 # Called to parse a single line from a message. If using this API directly then be sure 1100 # to call start_parse before the first call to parse_line. 1101 # 1102 # $line Line of file to parse 1103 # 1104 # --------------------------------------------------------------------------------------------- 1105 sub parse_line 1106 { 1107 my ( $self, $read ) = @_; 1108 1109 if ( $read ne '' ) { 1110 1111 # For the Mac we do further splitting of the line at the CR characters 1112 1113 while ( $read =~ s/(.*?)[\r\n]+// ) { 1114 my $line = "$1\r\n"; 1115 1116 next if ( !defined($line) ); 1117 1118 print ">>> $line" if $self->{debug__}; 1119 1120 if ($self->{color__}) { 1121 1122 if (!$self->{in_html_tag__}) { 1123 $self->{colorized__} .= $self->{ut__}; 1124 $self->{ut__} = ''; 1125 } 1126 1127 $self->{ut__} .= splitline($line, $self->{encoding__}); 1128 } 1129 1130 if ($self->{in_headers__}) { 1131 1132 # temporary colorization while in headers is handled within parse_header 1133 1134 $self->{ut__} = ''; 1135 1136 # Check for blank line signifying end of headers 1137 1138 if ( $line =~ /^(\r\n|\r|\n)/) { 1139 1140 # Parse the last header 1141 ($self->{mime__},$self->{encoding__}) = $self->parse_header($self->{header__},$self->{argument__},$self->{mime__},$self->{encoding__}); 1142 1143 # Clear the saved headers 1144 $self->{header__} = ''; 1145 $self->{argument__} = ''; 1146 1147 $self->{ut__} .= splitline( "\015\012", 0 ); 1148 1149 $self->{in_headers__} = 0; 1150 print "Header parsing complete.\n" if $self->{debug__}; 1151 1152 next; 1153 } 1154 1155 1156 # If we have an email header then just keep the part after the : 1157 1158 if ( $line =~ /^([A-Za-z-]+):[ \t]*([^\n\r]*)/ ) { 1159 1160 # Parse the last header 1161 1162 ($self->{mime__},$self->{encoding__}) = $self->parse_header($self->{header__},$self->{argument__},$self->{mime__},$self->{encoding__}) if ($self->{header__} ne ''); 1163 1164 # Save the new information for the current header 1165 1166 $self->{header__} = $1; 1167 $self->{argument__} = $2; 1168 next; 1169 } 1170 1171 # Append to argument if the next line begins with whitespace (isn't a new header) 1172 1173 if ( $line =~ /^([\t ].*?)(\r\n|\r|\n)/ ) { 1174 $self->{argument__} .= "\015\012" . $1; 1175 } 1176 next; 1177 } 1178 1179 # If we are in a mime document then spot the boundaries 1180 1181 if ( ( $self->{mime__} ne '' ) && ( $line =~ /^\-\-($self->{mime__})(\-\-)?/ ) ) { 1182 1183 # approach each mime part with fresh eyes 1184 1185 $self->{encoding__} = ''; 1186 1187 if (!defined $2) { 1188 print "Hit MIME boundary --$1\n" if $self->{debug__}; 1189 $self->{in_headers__} = 1; 1190 } else { 1191 1192 $self->{in_headers__} = 0; 1193 1194 my $boundary = $1; 1195 1196 print "Hit MIME boundary terminator --$1--\n" if $self->{debug__}; 1197 1198 # escape to match escaped boundary characters 1199 1200 $boundary =~ s/(.*)/\Q$1\E/g; 1201 1202 my $temp_mime; 1203 1204 foreach my $aboundary (split(/\|/,$self->{mime__})) { 1205 if ($boundary ne $aboundary) { 1206 if (defined $temp_mime) { 1207 $temp_mime = join('|', $temp_mime, $aboundary); 1208 } else { 1209 $temp_mime = $aboundary 1210 } 1211 } 1212 } 1213 1214 $self->{mime__} = ($temp_mime || ''); 1215 1216 print "MIME boundary list now $self->{mime__}\n" if $self->{debug__}; 1217 $self->{in_headers__} = 0; 1218 } 1219 1220 next; 1221 } 1222 1223 # If we are still in the headers then make sure that we are on a line with whitespace 1224 # at the start 1225 1226 if ( $self->{in_headers__} ) { 1227 if ( $line =~ /^[ \t\r\n]/ ) { 1228 next; 1229 } 1230 } 1231 1232 # If we are doing base64 decoding then look for suitable lines and remove them 1233 # for decoding 1234 1235 if ( $self->{encoding__} =~ /base64/i ) { 1236 $line =~ s/[\r\n]//g; 1237 $line =~ s/!$//; 1238 $self->{base64__} .= $line; 1239 1240 next; 1241 } 1242 1243 next if ( !defined($line) ); 1244 1245 # Look for =?foo? syntax that identifies a charset 1246 1247 if ( $line =~ /=\?([^ ]{1,40})\?/ ) { 1248 update_word( $self, $1, 0, '', '', 'charset' ); 1249 } 1250 1251 # Decode quoted-printable 1252 1253 if ( $self->{encoding__} =~ /quoted\-printable/i ) { 1254 $line = decode_qp( $line ); 1255 $line =~ s/[\r\n]+$//g; 1256 $self->{ut__} = decode_qp( $self->{ut__} ) if ( $self->{color__} ); 1257 } 1258 1259 parse_html( $self, $line, 0 ); 1260 } 1261 } 1262 } 1263 1264 # --------------------------------------------------------------------------------------------- 1265 # 1266 # clear_out_base64 1267 # 1268 # If there's anything in the {base64__} then decode it and parse it, returns colorization 1269 # information to be added to the colorized output 1270 # 1271 # --------------------------------------------------------------------------------------------- 1272 sub clear_out_base64 1273 { 1274 my ( $self ) = @_; 1275 1276 my $colorized = ''; 1277 1278 if ( $self->{base64__} ne '' ) { 1279 my $decoded = ''; 1280 1281 $self->{ut__} = '' if $self->{color__}; 1282 $self->{base64__} =~ s/ //g; 1283 1284 print "Base64 data: " . $self->{base64__} . "\n" if ($self->{debug__}); 1285 1286 $decoded = decode_base64( $self->{base64__} ); 1287 $self->parse_html( $decoded, 1 ); 1288 1289 print "Decoded: " . $decoded . "\n" if ($self->{debug__}); 1290 1291 $self->{ut__} = "<b>Found in encoded data:</b> " . $self->{ut__} if ( $self->{color__} ); 1292 1293 if ( $self->{color__} ) { 1294 if ( $self->{ut__} ne '' ) { 1295 $colorized = $self->{ut__}; 1296 $self->{ut__} = ''; 1297 } 1298 } 1299 } 1300 1301 $self->{base64__} = ''; 1302 1303 return $colorized; 1304 } 1305 1306 # --------------------------------------------------------------------------------------------- 1307 # 1308 # decode_string - Decode MIME encoded strings used in the header lines in email messages 1309 # 1310 # $mystring - The string that neeeds decode 1311 # 1312 # Return the decoded string, this routine recognizes lines of the form 1313 # 1314 # =?charset?[BQ]?text?= 1315 # 1316 # A B indicates base64 encoding, a Q indicates quoted printable encoding 1317 # --------------------------------------------------------------------------------------------- 1318 sub decode_string 1319 { 1320 # I choose not to use "$mystring = MIME::Base64::decode( $1 );" because some spam mails 1321 # have subjects like: "Subject: adjpwpekm =?ISO-8859-1?Q?=B2=E1=A4=D1=AB=C7?= dopdalnfjpw". 1322 # Therefore, it will be better to store the decoded text in a temporary variable and substitute 1323 # the original string with it later. Thus, this subroutine returns the real decoded result. 1324 1325 my ( $self, $mystring ) = @_; 1326 1327 my $decode_it = ''; 1328 1329 while ( $mystring =~ /=\?[\w-]+\?(B|Q)\?(.*?)\?=/ig ) { 1330 if ($1 eq "B") { 1331 $decode_it = decode_base64( $2 ); 1332 $mystring =~ s/=\?[\w-]+\?B\?(.*?)\?=/$decode_it/i; 1333 } else { 1334 if ($1 eq "Q") { 1335 $decode_it = $2; 1336 $decode_it =~ s/\_/=20/g; 1337 $decode_it = decode_qp( $decode_it ); 1338 $mystring =~ s/=\?[\w-]+\?Q\?(.*?)\?=/$decode_it/i; 1339 } 1340 } 1341 } 1342 1343 return $mystring; 1344 } 1345 1346 # --------------------------------------------------------------------------------------------- 1347 # 1348 # get_header - Returns the value of the from, to, subject or cc header 1349 # 1350 # $header Name of header to return (note must be lowercase) 1351 # 1352 # --------------------------------------------------------------------------------------------- 1353 sub get_header 1354 { 1355 my ( $self, $header ) = @_; 1356 1357 return $self->{$header . '__'}; 1358 } 1359 1360 1361 # --------------------------------------------------------------------------------------------- 1362 # 1363 # parse_header - Performs parsing operations on a message header 1364 # 1365 # $header Name of header being processed 1366 # $argument Value of header being processed 1367 # $mime The presently saved mime boundaries list 1368 # $encoding Current message encoding 1369 # 1370 # --------------------------------------------------------------------------------------------- 1371 sub parse_header 1372 { 1373 my ($self, $header, $argument, $mime, $encoding) = @_; 1374 1375 print "Header ($header) ($argument)\n" if ($self->{debug__}); 1376 1377 if ( $self->{color__} ) { 1378 my $color = $self->{bayes__}->get_color( "header:$header" ); 1379 1380 my $fix_argument = $argument; 1381 $fix_argument =~ s/</</g; 1382 $fix_argument =~ s/>/>/g; 1383 1384 $self->{ut__} = "<b><font color=\"$color\">$header</font></b>: $fix_argument\015\012"; 1385 } 1386 1387 # After a discussion with Tim Peters and some looking at emails 1388 # I'd received I discovered that the header names (case sensitive) are 1389 # very significant in identifying different types of mail, for example 1390 # much spam uses MIME-Version, MiME-Version and Mime-Version 1391 1392 $self->update_pseudoword( 'header', $header, 0, $header ); 1393 1394 # Check the encoding type in all RFC 2047 encoded headers 1395 1396 if ( $argument =~ /=\?([^ ]{1,40})\?(Q|B)/i ) { 1397 update_word( $self, $1, 0, '', '', 'charset' ); 1398 } 1399 1400 # Handle the From, To and Cc headers and extract email addresses 1401 # from them and treat them as words 1402 1403 # For certain headers we are going to mark them specially in the corpus 1404 # by tagging them with where they were found to help the classifier 1405 # do a better job. So if you have 1406 # 1407 # From: foo@bar.com 1408 # 1409 # then we'll add from:foo@bar.com to the corpus and not just foo@bar.com 1410 1411 my $prefix = ''; 1412 1413 if ( $header =~ /^(From|To|Cc|Reply\-To)$/i ) { 1414 1415 # These headers at least can be decoded 1416 1417 $argument = $self->decode_string( $argument ); 1418 1419 if ( $argument =~ /=\?([^ ]{1,40})\?/ ) { 1420 update_word( $self, $1, 0, '', '', 'charset' ); 1421 } 1422 1423 if ( $header =~ /^From$/i ) { 1424 $self->{from__} = $argument if ( $self->{from__} eq '' ) ; 1425 $prefix = 'from'; 1426 push @{$self->{quickmagnets__}{$prefix}}, $argument if ($argument ne ''); 1427 } 1428 1429 if ( $header =~ /^To$/i ) { 1430 $prefix = 'to'; 1431 $self->{to__} = $argument if ( $self->{to__} eq '' ); 1432 } 1433 1434 if ( $header =~ /^Cc$/i ) { 1435 $prefix = 'cc'; 1436 $self->{cc__} = $argument if ( $self->{cc__} eq '' ); 1437 } 1438 1439 while ( $argument =~ s/<([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+?))>// ) { 1440 update_word($self, $1, 0, ';', '&',$prefix); 1441 add_url($self, $2, 0, '@', '[&<]',$prefix); 1442 } 1443 1444 while ( $argument =~ s/([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+))// ) { 1445 update_word($self, $1, 0, '', '',$prefix); 1446 add_url($self, $2, 0, '@', '',$prefix); 1447 } 1448 1449 add_line( $self, $argument, 0, $prefix ); 1450 return ($mime, $encoding); 1451 } 1452 1453 if ( $header =~ /^Subject$/i ) { 1454 $prefix = 'subject'; 1455 $argument = $self->decode_string( $argument ); 1456 $self->{subject__} = $argument if ( ( $self->{subject__} eq '' ) ); 1457 } 1458 1459 $self->{date__} = $argument if ( $header =~ /^Date$/i ); 1460 1461 # Look for MIME 1462 1463 if ( $header =~ /^Content-Type$/i ) { 1464 if ( $argument =~ /charset=\"?([^\" ]{1,40})\"?/ ) { 1465 update_word( $self, $1, 0, '' , '', 'charset' ); 1466 } 1467 1468 if ( $argument =~ /^(.*?)(;)/ ) { 1469 print "Set content type to $1\n" if $self->{debug__}; 1470 $self->{content_type__} = $1; 1471 } 1472 1473 if ( $argument =~ /multipart\//i ) { 1474 my $boundary = $argument; 1475 1476 if ( $boundary =~ /boundary= ?(\"([A-Z0-9\'\(\)\+\_\,\-\.\/\:\=\?][A-Z0-9\'\(\)\+_,\-\.\/:=\? ]{0,69})\"|([^\(\)\<\>\@\,\;\:\\\"\/\[\]\?\=]{1,70}))/i ) { 1477 1478 $boundary = ($2 || $3); 1479 1480 $boundary =~ s/(.*)/\Q$1\E/g; 1481 1482 if ($mime ne '') { 1483 1484 # Fortunately the pipe character isn't a valid mime boundary character! 1485 1486 $mime = join('|', $mime, $boundary); 1487 } else { 1488 $mime = $boundary; 1489 } 1490 print "Set mime boundary to " . $mime . "\n" if $self->{debug__}; 1491 return ($mime, $encoding); 1492 } 1493 } 1494 return ($mime, $encoding); 1495 } 1496 1497 # Look for the different encodings in a MIME document, when we hit base64 we will 1498 # do a special parse here since words might be broken across the boundaries 1499 1500 if ( $header =~ /^Content-Transfer-Encoding$/i ) { 1501 $encoding = $argument; 1502 print "Setting encoding to $encoding\n" if $self->{debug__}; 1503 my $compact_encoding = $encoding; 1504 $compact_encoding =~ s/[^A-Za-z0-9]//g; 1505 $self->update_pseudoword( 'encoding', $compact_encoding, 0, $encoding ); 1506 return ($mime, $encoding); 1507 } 1508 1509 # Some headers to discard 1510 1511 return ($mime, $encoding) if ( $header =~ /^(Thread-Index|X-UIDL|Message-ID|X-Text-Classification|X-Mime-Key)$/i ); 1512 1513 # Some headers should never be RFC 2047 decoded 1514 1515 $argument = $self->decode_string($argument) unless ($header =~ /^(Received|Content\-Type|Content\-Disposition)$/i); 1516 1517 add_line( $self, $argument, 0, $prefix ); 1518 1519 return ($mime, $encoding); 1520 } 1521 1522 1523 # --------------------------------------------------------------------------------------------- 1524 # 1525 # splitline - Escapes characters so a line will print as plain-text within a HTML document. 1526 # 1527 # $line The line to escape 1528 # $encoding The value of any current encoding scheme 1529 # 1530 # --------------------------------------------------------------------------------------------- 1531 1532 sub splitline 1533 { 1534 my ($line, $encoding) = @_; 1535 $line =~ s/([^\r\n]{100,120} )/$1\r\n/g; 1536 $line =~ s/([^ \r\n]{120})/$1\r\n/g; 1537 1538 $line =~ s/</</g; 1539 $line =~ s/>/>/g; 1540 1541 if ( $encoding =~ /quoted\-printable/i ) { 1542 $line =~ s/=3C/</g; 1543 $line =~ s/=3E/>/g; 1544 } 1545 1546 $line =~ s/\t/ /g; 1547 1548 return $line; 1549 } 1550 1551 # GETTERS/SETTERS 1552 1553 sub first20 1554 { 1555 my ( $self ) = @_; 1556 1557 return $self->{first20__}; 1558 } 1559 1560 sub quickmagnets 1561 { 1562 my ( $self ) = @_; 1563 1564 return $self->{quickmagnets__}; 1565 } 1566 1567 1; 1568 1569