1  package Classifier::MailParse;
    2  
    3  # ---------------------------------------------------------------------------------------------
    4  #
    5  # MailParse.pm --- Parse a mail message or messages into words
    6  #
    7  # Copyright (c) 2001-2003 John Graham-Cumming
    8  #
    9  # ---------------------------------------------------------------------------------------------
   10  
   11  use strict;
   12  use locale;
   13  use Classifier::WordMangle;
   14  
   15  use MIME::Base64;
   16  use MIME::QuotedPrint;
   17  
   18  # HTML entity mapping to character codes, this maps things like & to their corresponding
   19  # character code
   20  
   21  my %entityhash = ('aacute'  => 224,     'Aacute'  => 202,     'Acirc'   => 203,     'acirc'   => 225, # PROFILE BLOCK START
   22                    'acute'   => 189,     'AElig'   => 207,     'aelig'   => 229,     'Agrave'  => 201,
   23                    'agrave'  => 223,     'amp'     => 38,      'Aring'   => 206,     'aring'   => 228,
   24                    'atilde'  => 226,     'Atilde'  => 204,     'Auml'    => 196,     'auml'    => 228,
   25                    'brvbar'  => 166,     'ccedil'  => 230,     'Ccedil'  => 208,     'cedil'   => 193,
   26                    'cent'    => 162,     'copy'    => 169,     'curren'  => 164,     'deg'     => 185,
   27                    'divide'  => 246,     'Eacute'  => 210,     'eacute'  => 232,     'ecirc'   => 233,
   28                    'Ecirc'   => 211,     'Egrave'  => 209,     'egrave'  => 231,     'ETH'     => 217,
   29                    'eth'     => 239,     'Euml'    => 212,     'euml'    => 234,     'frac12'  => 198,
   30                    'frac14'  => 197,     'frac34'  => 199,     'iacute'  => 236,     'Iacute'  => 214,
   31                    'icirc'   => 237,     'Icirc'   => 215,     'iexcl'   => 161,     'igrave'  => 235,
   32                    'Igrave'  => 213,     'iquest'  => 200,     'iuml'    => 238,     'Iuml'    => 216,
   33                    'laquo'   => 180,     'macr'    => 184,     'micro'   => 190,     'middot'  => 192,
   34                    'nbsp'    => 160,     'not'     => 181,     'ntilde'  => 240,     'Ntilde'  => 218,
   35                    'oacute'  => 242,     'Oacute'  => 210,     'Ocirc'   => 211,     'ocirc'   => 243,
   36                    'Ograve'  => 219,     'ograve'  => 241,     'ordf'    => 170,     'ordm'    => 195,
   37                    'oslash'  => 247,     'Oslash'  => 215,     'Otilde'  => 212,     'otilde'  => 244,
   38                    'Ouml'    => 214,     'ouml'    => 246,     'para'    => 191,     'plusmn'  => 186,
   39                    'pound'   => 163,     'raquo'   => 196,     'reg'     => 183,     'sect'    => 167,
   40                    'shy'     => 182,     'sup1'    => 194,     'sup2'    => 187,     'sup3'    => 188,
   41                    'szlig'   => 223,     'thorn'   => 253,     'THORN'   => 221,     'times'   => 214,
   42                    'Uacute'  => 217,     'uacute'  => 249,     'ucirc'   => 250,     'Ucirc'   => 218,
   43                    'ugrave'  => 248,     'Ugrave'  => 216,     'uml'     => 168,     'Uuml'    => 220,
   44                    'uuml'    => 252,     'Yacute'  => 220,     'yacute'  => 252,     'yen'     => 165,
   45                    'yuml'    => 254); # PROFILE BLOCK STOP
   46  
   47  #----------------------------------------------------------------------------
   48  # new
   49  #
   50  # Class new() function
   51  #----------------------------------------------------------------------------
   52  sub new
   53  {
   54      my $type = shift;
   55      my $self;
   56  
   57      # Used to mangle words into the right shape for classification
   58  
   59      $self->{mangle__} = new Classifier::WordMangle;
   60  
   61      # Hash of word frequences
   62  
   63      $self->{words__}  = {};
   64  
   65      # Total word cout
   66  
   67      $self->{msg_total__} = 0;
   68  
   69      # Internal use for keeping track of a line without touching it
   70  
   71      $self->{ut__}        = '';
   72  
   73      # Specifies the parse mode, 1 means color the output
   74  
   75      $self->{color__}     = 0;
   76  
   77      # This will store the from, to, cc and subject from the last parse
   78      $self->{from__}      = '';
   79      $self->{to__}        = '';
   80      $self->{cc__}        = '';
   81      $self->{subject__}   = '';
   82  
   83      # This is used to store the words found in the from, to, and subject
   84      # lines for use in creating new magnets, it is a list of pairs mapping
   85      # a magnet type to a magnet string, e.g. from => popfile@jgc.org
   86  
   87      $self->{quickmagnets__}      = {};
   88  
   89      # These store the current HTML background color2 and font color to
   90      # detect "invisible ink" used by spammers
   91  
   92      $self->{htmlbackcolor__} = map_color( $self, 'white' );
   93      $self->{htmlbodycolor__} = map_color( $self, 'white' );
   94      $self->{htmlfontcolor__} = map_color( $self, 'black' );
   95  
   96      # This is the distance betwee the back color and the font color
   97      # as computed using compute_rgb_distance
   98  
   99      $self->{htmlcolordistance__} = 0;
  100  
  101      # This is a mapping between HTML color names and HTML hexadecimal color values used by the
  102      # map_color value to get canonical color values
  103  
  104      $self->{color_map__} = { 'aliceblue','f0f8ff', 'antiquewhite','faebd7', 'aqua','00ffff', 'aquamarine','7fffd4', 'azure','f0ffff', # PROFILE BLOCK START
  105          'beige','f5f5dc', 'bisque','ffe4c4', 'black','000000', 'blanchedalmond','ffebcd', 'blue','0000ff', 'blueviolet','8a2be2',
  106          'brown','a52a2a', 'burlywood','deb887', 'cadetblue','5f9ea0', 'chartreuse','7fff00', 'chocolate','d2691e', 'coral','ff7f50',
  107          'cornflowerblue','6495ed', 'cornsilk','fff8dc', 'crimson','dc143c', 'cyan','00ffff', 'darkblue','00008b', 'darkcyan','008b8b',
  108          'darkgoldenrod','b8860b', 'darkgray','a9a9a9', 'darkgreen','006400', 'darkkhaki','bdb76b', 'darkmagenta','8b008b', 'darkolivegreen','556b2f',
  109          'darkorange','ff8c00', 'darkorchid','9932cc', 'darkred','8b0000', 'darksalmon','e9967a', 'darkseagreen','8fbc8f', 'darkslateblue','483d8b',
  110          'darkturquoise','00ced1', 'darkviolet','9400d3', 'deeppink','ff1493', 'deepskyblue','00bfff', 'deepskyblue','2f4f4f', 'dimgray','696969',
  111          'dodgerblue','1e90ff', 'firebrick','b22222', 'floralwhite','fffaf0', 'forestgreen','228b22', 'fuchsia','ff00ff', 'gainsboro','dcdcdc',
  112          'ghostwhite','f8f8ff', 'gold','ffd700', 'goldenrod','daa520', 'gray','808080', 'green','008000', 'greenyellow','adff2f',
  113          'honeydew','f0fff0', 'hotpink','ff69b4', 'indianred','cd5c5c', 'indigo','4b0082', 'ivory','fffff0', 'khaki','f0e68c',
  114          'lavender','e6e6fa', 'lavenderblush','fff0f5', 'lawngreen','7cfc00', 'lemonchiffon','fffacd', 'lightblue','add8e6',
  115          'lightcoral','f08080', 'lightcyan','e0ffff', 'lightgoldenrodyellow','fafad2', 'lightgreen','90ee90', 'lightgrey','d3d3d3',
  116          'lightpink','ffb6c1', 'lightsalmon','ffa07a', 'lightseagreen','20b2aa', 'lightskyblue','87cefa', 'lightslategray','778899',
  117          'lightsteelblue','b0c4de', 'lightyellow','ffffe0', 'lime','00ff00', 'limegreen','32cd32', 'linen','faf0e6', 'magenta','ff00ff',
  118          'maroon','800000', 'mediumaquamarine','66cdaa', 'mediumblue','0000cd', 'mediumorchid','ba55d3', 'mediumpurple','9370db',
  119          'mediumseagreen','3cb371', 'mediumslateblue','7b68ee', 'mediumspringgreen','00fa9a', 'mediumturquoise','48d1cc',
  120          'mediumvioletred','c71585', 'midnightblue','191970', 'mintcream','f5fffa', 'mistyrose','ffe4e1', 'moccasin','ffe4b5',
  121          'navajowhite','ffdead', 'navy','000080', 'oldlace','fdf5e6', 'olive','808000', 'olivedrab','6b8e23', 'orange','ffa500',
  122          'orangered','ff4500', 'orchid','da70d6', 'palegoldenrod','eee8aa', 'palegreen','98fb98', 'paleturquoise','afeeee',
  123          'palevioletred','db7093', 'papayawhip','ffefd5', 'peachpuff','ffdab9', 'peru','cd853f', 'pink','ffc0cb', 'plum','dda0dd',
  124          'powderblue','b0e0e6', 'purple','800080', 'red','ff0000', 'rosybrown','bc8f8f', 'royalblue','4169e1', 'saddlebrown','8b4513',
  125          'salmon','fa8072', 'sandybrown','f4a460', 'seagreen','2e8b57', 'seashell','fff5ee', 'sienna','a0522d', 'silver','c0c0c0',
  126          'skyblue','87ceeb', 'slateblue','6a5acd', 'slategray','708090', 'snow','fffafa', 'springgreen','00ff7f', 'steelblue','4682b4',
  127          'tan','d2b48c', 'teal','008080', 'thistle','d8bfd8', 'tomato','ff6347', 'turquoise','40e0d0', 'violet','ee82ee', 'wheat','f5deb3',
  128          'white','ffffff', 'whitesmoke','f5f5f5', 'yellow','ffff00', 'yellowgreen','9acd32' }; # PROFILE BLOCK STOP
  129  
  130      $self->{content_type__} = '';
  131      $self->{base64__}       = '';
  132      $self->{in_html_tag__}  = 0;
  133      $self->{html_tag__}     = '';
  134      $self->{html_arg__}     = '';
  135      $self->{in_headers__}   = 0;
  136      $self->{first20__}      = '';
  137  
  138      return bless $self, $type;
  139  }
  140  
  141  # ---------------------------------------------------------------------------------------------
  142  #
  143  # compute_rgb_distance
  144  #
  145  # Given two RGB colors compute the distance between them by considering them as points
  146  # in 3 dimensions and calculating the distance between them (or equivalently the length
  147  # of a vector between them)
  148  #
  149  # $left          One color
  150  # $right         The other color
  151  #
  152  # ---------------------------------------------------------------------------------------------
  153  sub compute_rgb_distance
  154  {
  155      my ( $self, $left, $right ) = @_;
  156  
  157      # Figure out where the left color is and then subtract the right
  158      # color (point from it) to get the vector
  159  
  160      $left =~ /^(..)(..)(..)$/;
  161      my ( $rl, $gl, $bl ) = ( hex($1), hex($2), hex($3) );
  162  
  163      $right =~ /^(..)(..)(..)$/;
  164      my ( $r, $g, $b ) = ( $rl - hex($1), $gl - hex($2), $bl - hex($3) );
  165  
  166      # Now apply Pythagoras in 3D to get the distance between them, we return
  167      # the int because we don't need decimal level accuracy
  168  
  169      return int( sqrt( $r*$r + $g*$g + $b*$b ) );
  170  }
  171  
  172  # ---------------------------------------------------------------------------------------------
  173  #
  174  # compute_html_color_distance
  175  #
  176  # Calls compute_rgb_distance to set up htmlcolordistance__ from the current HTML back and
  177  # font colors
  178  #
  179  # ---------------------------------------------------------------------------------------------
  180  sub compute_html_color_distance
  181  {
  182      my ( $self ) = @_;
  183  
  184      $self->{htmlcolordistance__} = $self->compute_rgb_distance( $self->{htmlfontcolor__}, $self->{htmlbackcolor__} );
  185  }
  186  
  187  # ---------------------------------------------------------------------------------------------
  188  #
  189  # map_color
  190  #
  191  # Convert an HTML color value into its canonical lower case hexadecimal form with no #
  192  #
  193  # $color        A color value found in a tag
  194  #
  195  # ---------------------------------------------------------------------------------------------
  196  sub map_color
  197  {
  198      my ( $self, $color ) = @_;
  199  
  200      # The canonical form is lowercase hexadecimal, so start by lowercasing and stripping any
  201      # initial #
  202  
  203      $color = lc( $color );
  204      $color =~ s/^#//;
  205  
  206      # Map color names to hexadecimal values
  207  
  208      if ( defined( $self->{color_map__}{$color} ) ) {
  209          return $self->{color_map__}{$color};
  210      } else {
  211          return $color;
  212      }
  213  }
  214  
  215  # ---------------------------------------------------------------------------------------------
  216  #
  217  # increment_word
  218  #
  219  # Updates the word frequency for a word without performing any coloring or transformation
  220  # on the word
  221  #
  222  # $word     The word
  223  #
  224  # ---------------------------------------------------------------------------------------------
  225  sub increment_word
  226  {
  227      my ($self, $word) = @_;
  228  
  229      $self->{words__}{$word} += 1;
  230      $self->{msg_total__}    += 1;
  231  
  232      print "--- $word ($self->{words__}{$word})\n" if ($self->{debug__});
  233  }
  234  
  235  # ---------------------------------------------------------------------------------------------
  236  #
  237  # update_pseudoword
  238  #
  239  # Updates the word frequency for a pseudoword, note that this differs from update_word
  240  # because it does no word mangling
  241  #
  242  # $prefix       The pseudoword prefix (e.g. header)
  243  # $word         The pseudoword (e.g. Mime-Version)
  244  # $encoded      Whether this was found inside encoded text
  245  # $literal      The literal text that generated this pseudoword
  246  #
  247  # ---------------------------------------------------------------------------------------------
  248  sub update_pseudoword
  249  {
  250      my ( $self, $prefix, $word, $encoded, $literal ) = @_;
  251  
  252      my $mword = "$prefix:$word";
  253  
  254      if ( $self->{color__} ) {
  255          if ( $encoded == 1 )  {
  256              $literal =~ s/</&lt;/g;
  257              $literal =~ s/>/&gt;/g;
  258              my $color = $self->{bayes__}->get_color($mword);
  259              my $to    = "<b><font color=\"$color\"><a title=\"$mword\">$literal</a></font></b>";
  260              $self->{ut__} .= $to . ' ';
  261          }
  262      }
  263  
  264     $self->increment_word( $mword );
  265  }
  266  
  267  # ---------------------------------------------------------------------------------------------
  268  #
  269  # update_word
  270  #
  271  # Updates the word frequency for a word
  272  #
  273  # $word         The word that is being updated
  274  # $encoded      1 if the line was found in encoded text (base64)
  275  # $before       The character that appeared before the word in the original line
  276  # $after        The character that appeared after the word in the original line
  277  # $prefix       A string to prefix any words with in the corpus, used for the special
  278  #               identification of values found in for example the subject line
  279  #
  280  # ---------------------------------------------------------------------------------------------
  281  sub update_word
  282  {
  283      my ($self, $word, $encoded, $before, $after, $prefix) = @_;
  284  
  285      my $mword = $self->{mangle__}->mangle($word);
  286  
  287      if ( $mword ne '' )  {
  288          $mword = $prefix . ':' . $mword if ( $prefix ne '' );
  289  
  290          if ( $prefix =~ /(from|to|cc|subject)/i ) {
  291              push @{$self->{quickmagnets__}{$prefix}}, $word;
  292          }
  293  
  294          if ( $self->{color__} ) {
  295              my $color = $self->{bayes__}->get_color($mword);
  296              if ( $encoded == 0 )  {
  297                  $after = '&' if ( $after eq '>' );
  298                  if ( !( $self->{ut__} =~ s/($before)\Q$word\E($after)/$1<b><font color=\"$color\">$word<\/font><\/b>$2/ ) ) {
  299                      print "Could not find $word for colorization\n" if ( $self->{debug__} );
  300                  }
  301              } else {
  302                  $self->{ut__} .= "<font color=\"$color\">$word<\/font> ";
  303              }
  304          }
  305  
  306          $self->increment_word( $mword );
  307      }
  308  }
  309  
  310  # ---------------------------------------------------------------------------------------------
  311  #
  312  # add_line
  313  #
  314  # Parses a single line of text and updates the word frequencies
  315  #
  316  # $bigline      The line to split into words and add to the word counts
  317  # $encoded      1 if the line was found in encoded text (base64)
  318  # $prefix       A string to prefix any words with in the corpus, used for the special
  319  #               identification of values found in for example the subject line
  320  #
  321  # ---------------------------------------------------------------------------------------------
  322  sub add_line
  323  {
  324      my ($self, $bigline, $encoded, $prefix) = @_;
  325      my $p = 0;
  326  
  327      print "add_line: [$bigline]\n" if $self->{debug__};
  328  
  329      # If the line is really long then split at every 1k and feed it to the parser below
  330  
  331      # Check the HTML back and font colors to ensure that we are not about to
  332      # add words that are hidden inside invisible ink
  333  
  334      if ( $self->{htmlfontcolor__} ne $self->{htmlbackcolor__} ) {
  335  
  336          # If we are adding a line and the colors are different then we will
  337          # add a count for the color difference to make sure that we catch
  338          # camouflage attacks using similar colors, if the color similarity
  339          # is less than 100.  I chose 100 somewhat arbitrarily but classic
  340          # black text on white background has a distance of 441, red/blue or
  341          # green on white has distance 255.  100 seems like a reasonable upper
  342          # bound for tracking evil spammer tricks with similar colors
  343  
  344          $self->compute_html_color_distance();
  345          if ( $self->{htmlcolordistance__} < 100 ) {
  346              $self->update_pseudoword( 'html', "colordistance$self->{htmlcolordistance__}", $encoded, '' );
  347          }
  348  
  349          while ( $p < length($bigline) ) {
  350              my $line = substr($bigline, $p, 1024);
  351  
  352              # mangle up html character entities
  353              # these are just the low ISO-Latin1 entities
  354              # see: http://www.w3.org/TR/REC-html32#latin1
  355              # TODO: find a way to make this (and other similar stuff) highlight
  356              #       without using the encoded content printer or modifying $self->{ut__}
  357  
  358              while ( $line =~ m/(&(\w{3,6});)/g ) {
  359                  my $from = $1;
  360                  my $to   = $entityhash{$2};
  361  
  362                  if ( defined( $to ) ) {
  363                      $to         = chr($to);
  364                      $line       =~ s/$from/$to/g;
  365                      $self->{ut__} =~ s/$from/$to/g;
  366                      print "$from -> $to\n" if $self->{debug__};
  367                  }
  368              }
  369  
  370              while ( $line =~ m/(&#([\d]{1,3});)/g ) {
  371  
  372                  # Don't decode odd (nonprintable) characters or < >'s.
  373  
  374                  if ( ( ( $2 < 255 ) && ( $2 > 63 ) ) || ( $2 == 61 ) || ( ( $2 < 60 ) && ( $2 > 31 ) ) ) {
  375                      my $from = $1;
  376                      my $to   = chr($2);
  377  
  378                      if ( defined( $to ) &&  ( $to ne '' ) ) {
  379                          $line       =~ s/$from/$to/g;
  380                          $self->{ut__} =~ s/$from/$to/g;
  381                          print "$from -> $to\n" if $self->{debug__};
  382                          $self->update_pseudoword( 'html', 'numericentity', $encoded, $from );
  383                      }
  384                  }
  385              }
  386  
  387              # Pull out any email addresses in the line that are marked with <> and have an @ in them
  388  
  389              while ( $line =~ s/(mailto:)?([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+\.[[:alpha:]0-9\-_]+))([\&\)\?\:\/ >\&\;])// )  {
  390                  update_word($self, $2, $encoded, ($1?$1:''), '[\&\?\:\/ >\&\;]', $prefix);
  391                  add_url($self, $3, $encoded, '\@', '[\&\?\:\/]', $prefix);
  392              }
  393  
  394              # Grab domain names
  395              while ( $line =~ s/(([[:alpha:]0-9\-_]+\.)+)(com|edu|gov|int|mil|net|org|aero|biz|coop|info|museum|name|pro)([^[:alpha:]0-9\-_\.]|$)/$4/i )  {
  396                   add_url($self, "$1$3", $encoded, '', '', $prefix);
  397              }
  398  
  399              # Grab IP addresses
  400  
  401              while ( $line =~ s/(([12]?\d{1,2}\.){3}[12]?\d{1,2})// )  {
  402                  update_word($self, "$1", $encoded, '', '', $prefix);
  403              }
  404  
  405              # Deal with runs of alternating spaces and letters
  406  
  407              foreach my $space (' ', '\'', '*', '^', '`', '  ', '\38', '.' ){
  408                  while ( $line =~ s/( |^)(([A-Z]\Q$space\E){2,15}[A-Z])( |\Q$space\E|[!\?,])/ /i ) {
  409                      my $original = "$1$2$4";
  410                      my $word = $2;
  411                      print "$word ->" if $self->{debug__};
  412                      $word    =~ s/[^A-Z]//gi;
  413                      print "$word\n" if $self->{debug__};
  414                      $self->update_word( $word, $encoded, ' ', ' ', $prefix);
  415                      $self->update_pseudoword( 'trick', 'spacedout', $encoded, $original );
  416                  }
  417              }
  418  
  419              # Deal with random insertion of . inside words
  420  
  421              while ( $line =~ s/ ([A-Z]+)\.([A-Z]{2,}) / $1$2 /i ) {
  422                  $self->update_pseudoword( 'trick', 'dottedwords', $encoded, "$1$2" );
  423              }
  424  
  425              # Only care about words between 3 and 45 characters since short words like
  426              # an, or, if are too common and the longest word in English (according to
  427              # the OED) is pneumonoultramicroscopicsilicovolcanoconiosis
  428  
  429              while ( $line =~ s/([[:alpha:]][[:alpha:]\']{1,44})([_\-,\.\"\'\)\?!:;\/& \t\n\r]{0,5}|$)// ) {
  430                  if ( ( $self->{in_headers__} == 0 ) && ( $self->{first20count__} < 20 ) ) {
  431                      $self->{first20count__} += 1;
  432                      $self->{first20__} .= " $1";
  433                  }
  434  
  435                  update_word($self,$1, $encoded, '', '[_\-,\.\"\'\)\?!:;\/ &\t\n\r]', $prefix) if (length $1 >= 3);
  436              }
  437  
  438              $p += 1024;
  439          }
  440      } else {
  441          if ( $bigline =~ /[^ \t]/ ) {
  442              $self->update_pseudoword( 'trick', 'invisibleink', $encoded, $bigline );
  443          }
  444      }
  445  }
  446  
  447  # ---------------------------------------------------------------------------------------------
  448  #
  449  # update_tag
  450  #
  451  # Extract elements from within HTML tags that are considered important 'words' for analysis
  452  # such as domain names, alt tags,
  453  #
  454  # $tag      The tag name
  455  # $arg      The arguments
  456  # $end_tag  Whether this is an end tag or not
  457  # $encoded  1 if this HTML was found inside encoded (base64) text
  458  #
  459  # ---------------------------------------------------------------------------------------------
  460  sub update_tag
  461  {
  462      my ( $self, $tag, $arg, $end_tag, $encoded ) = @_;
  463  
  464      $tag =~ s/[\r\n]//g;
  465      $arg =~ s/[\r\n]//g;
  466  
  467      print "HTML tag $tag with argument " . $arg . "\n" if ($self->{debug__});
  468  
  469      # End tags do not require any argument decoding but we do look at them
  470      # to make sure that we handle /font to change the font color
  471  
  472      if ( $end_tag ) {
  473          if ( $tag =~ /^font$/i ) {
  474              $self->{htmlfontcolor__} = map_color( $self, 'black' );
  475              $self->compute_html_color_distance();
  476          }
  477  
  478          # If we hit a table tag then any font information is lost
  479  
  480          if ( $tag =~ /^(table|td|tr|th)$/i ) {
  481              $self->{htmlfontcolor__} = map_color( $self, 'black' );
  482              $self->{htmlbackcolor__} = $self->{htmlbodycolor__};
  483              $self->compute_html_color_distance();
  484          }
  485  
  486          return;
  487      }
  488  
  489      # Count the number of TD elements
  490      $self->update_pseudoword('html', 'td', $encoded, $tag ) if ( $tag =~ /^td$/i );
  491  
  492      my $attribute;
  493      my $value;
  494  
  495      # These are used to pass good values to update_word
  496  
  497      my $quote;
  498      my $end_quote;
  499  
  500      # Strip the first attribute while there are any attributes
  501      # Match the closing attribute character, if there is none
  502      # (this allows nested single/double quotes),
  503      # match a space or > or EOL
  504  
  505      my $original;
  506  
  507      while ( $arg =~ s/[ \t]*((\w+)[ \t]*=[ \t]*([\"\'])?(.*?)(\3|($|([ \t>]))))//i ) {
  508          $original  = $1;
  509          $attribute = $2;
  510          $value     = $4;
  511          $quote     = '';
  512          $end_quote = '[\> \t\&\n]';
  513          if (defined $3) {
  514              $quote     = $3;
  515              $end_quote = $3;
  516          }
  517  
  518          print "   attribute $attribute with value $quote$value$quote\n" if ($self->{debug__});
  519  
  520          # Remove leading whitespace and leading value-less attributes
  521  
  522          if ( $arg =~ s/^(([ \t]*(\w+)[\t ]+)+)([^=])/$4/ ) {
  523              print "   attribute(s) " . $1 . " with no value\n" if ($self->{debug__});
  524          }
  525  
  526          # Toggle for parsing script URI's.
  527          # Should be left off (0) until more is known about how different html
  528          # rendering clients behave.
  529  
  530          my $parse_script_uri = 0;
  531  
  532          # Tags with src attributes
  533  
  534          if ( ( $attribute =~ /^src$/i ) && # PROFILE BLOCK START
  535               ( ( $tag =~ /^img|frame|iframe$/i )
  536                 || ( $tag =~ /^script$/i && $parse_script_uri ) ) ) { # PROFILE BLOCK STOP
  537  
  538              # "CID:" links refer to an origin-controlled attachment to a html email.
  539              # Adding strings from these, even if they appear to be hostnames, may or
  540              # may not be beneficial
  541  
  542              if ($value =~ /^cid\:/i )
  543              {
  544                  # TODO: Decide what to do here, ignoring CID's for now
  545              } else {
  546  
  547                  my $host = add_url( $self, $value, $encoded, $quote, $end_quote, '', 1 );
  548  
  549                  # If the host name is not blank (i.e. there was a hostname in the url
  550                  # and it was an image, then if the host was not this host then report
  551                  # an off machine image
  552  
  553                  if ( ( $host ne '' ) && ( $tag =~ /^img$/i ) ) {
  554                      if ( $host ne 'localhost' ) {
  555                          $self->update_pseudoword( 'html', 'imgremotesrc', $encoded, $original );
  556                      }
  557                  }
  558  
  559                  next;
  560              }
  561  
  562              add_url( $self, $value, $encoded, $quote, $end_quote, '' );
  563              next;
  564          }
  565  
  566          # Tags with href attributes
  567  
  568          if ( $attribute =~ /^href$/i && $tag =~ /^(a|link|base|area)$/i )  {
  569  
  570              # Look for mailto:'s
  571  
  572              if ($value =~ /^mailto:/i) {
  573                  if ( $tag =~ /^a$/ && $value =~ /^mailto:([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+?))([>\&\?\:\/]|$)/i )  {
  574                     update_word( $self, $1, $encoded, 'mailto:', ($3?'[\\\>\&\?\:\/]':$end_quote), '' );
  575                     add_url( $self, $2, $encoded, '@', ($3?'[\\\&\?\:\/]':$end_quote), '' );
  576                  }
  577              } else {
  578  
  579                  # Anything that isn't a mailto is probably an URL
  580  
  581                  $self->add_url($value, $encoded, $quote, $end_quote, '');
  582              }
  583  
  584              next;
  585          }
  586  
  587          # Tags with alt attributes
  588  
  589          if ( $attribute =~ /^alt$/i && $tag =~ /^img$/i )  {
  590              add_line($self, $value, $encoded, '');
  591              next;
  592           }
  593  
  594          # Tags with working background attributes
  595  
  596          if ( $attribute =~ /^background$/i && $tag =~ /^(td|table|body)$/i ) {
  597              add_url( $self, $value, $encoded, $quote, $end_quote, '' );
  598              next;
  599          }
  600  
  601          # Tags that load sounds
  602  
  603          if ( $attribute =~ /^bgsound$/i && $tag =~ /^body$/i ) {
  604              add_url( $self, $2, $encoded, $quote, $end_quote, '' );
  605              next;
  606          }
  607  
  608  
  609          # Tags with colors in them
  610  
  611          if ( ( $attribute =~ /^color$/i ) && ( $tag =~ /^font$/i ) ) {
  612              update_word( $self, $value, $encoded, $quote, $end_quote, '' );
  613              $self->update_pseudoword( 'html', "fontcolor$value", $encoded, $original );
  614              $self->{htmlfontcolor__} = map_color($self, $value);
  615              $self->compute_html_color_distance();
  616              print "Set html font color to $self->{htmlfontcolor__}\n" if ( $self->{debug__} );
  617          }
  618  
  619          if ( ( $attribute =~ /^text$/i ) && ( $tag =~ /^body$/i ) ) {
  620              $self->update_pseudoword( 'html', "fontcolor$value", $encoded, $original );
  621              update_word( $self, $value, $encoded, $quote, $end_quote, '' );
  622              $self->{htmlfontcolor__} = map_color($self, $value);
  623              $self->compute_html_color_distance();
  624              print "Set html font color to $self->{htmlfontcolor__}\n" if ( $self->{debug__} );
  625          }
  626  
  627          # The width and height of images
  628  
  629          if ( ( $attribute =~ /^(width|height)$/i ) && ( $tag =~ /^img$/i ) ) {
  630              $attribute = lc( $attribute );
  631              $self->update_pseudoword( 'html', "img$attribute$value", $encoded, $original );
  632          }
  633  
  634          # Font sizes
  635  
  636          if ( ( $attribute =~ /^size$/i ) && ( $tag =~ /^font$/i ) ) {
  637              $self->update_pseudoword( 'html', "fontsize$value", $encoded, $original );
  638          }
  639  
  640          # Tags with background colors
  641  
  642          if ( ( $attribute =~ /^(bgcolor|back)$/i ) && ( $tag =~ /^(td|table|body|tr|th|font)$/i ) ) {
  643              update_word( $self, $value, $encoded, $quote, $end_quote, '' );
  644              $self->update_pseudoword( 'html', "backcolor$value" );
  645              $self->{htmlbackcolor__} = map_color($self, $value);
  646              print "Set html back color to $self->{htmlbackcolor__}\n" if ( $self->{debug__} );
  647  
  648              $self->{htmlbodycolor__} = $self->{htmlbackcolor__} if ( $tag =~ /^body$/i );
  649              $self->compute_html_color_distance();
  650          }
  651  
  652          # Tags with a charset
  653  
  654          if ( ( $attribute =~ /^content$/i ) && ( $tag =~ /^meta$/i ) ) {
  655              if ( $value=~ /charset=([^ ]{1,40})[\"\>]?/ ) {
  656                  update_word( $self, $1, $encoded, '', '', '' );
  657              }
  658          }
  659  
  660          # Tags with style attributes (this one may impact performance!!!)
  661          # most container tags accept styles, and the background style may
  662          # not be in a predictable location (search the entire value)
  663  
  664          if ( $attribute =~ /^style$/i && $tag =~ /^(body|td|tr|table|span|div|p)$/i ) {
  665              add_url( $self, $1, $encoded, '[\']', '[\']', '' ) if ( $value =~ /background\-image:[ \t]?url\([ \t]?\'(.*)\'[ \t]?\)/i );
  666              next;
  667          }
  668  
  669          # Tags with action attributes
  670  
  671          if ( $attribute =~ /^action$/i && $tag =~ /^form$/i )  {
  672              if ( $value =~ /^(ftp|http|https):\/\//i ) {
  673                  add_url( $self, $value, $encoded, $quote, $end_quote, '' );
  674                  next;
  675              }
  676  
  677              # mailto forms
  678  
  679              if ( $value =~ /^mailto:([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+?))([>\&\?\:\/])/i )  {
  680                 update_word( $self, $1, $encoded, 'mailto:', ($3?'[\\\>\&\?\:\/]':$end_quote), '' );
  681                 add_url( $self, $2, $encoded, '@', ($3?'[\\\>\&\?\:\/]':$end_quote), '' );
  682              }
  683              next;
  684          }
  685      }
  686  }
  687  
  688  # ---------------------------------------------------------------------------------------------
  689  #
  690  # add_url
  691  #
  692  # Parses a single url or domain and identifies interesting parts
  693  #
  694  # $url          the domain name to handle
  695  # $encoded      1 if the domain was found in encoded text (base64)
  696  # $before       The character that appeared before the URL in the original line
  697  # $after        The character that appeared after the URL in the original line
  698  # $prefix       A string to prefix any words with in the corpus, used for the special
  699  #               identification of values found in for example the subject line
  700  # $noadd        If defined indicates that only parsing should be done, no word updates
  701  #
  702  # Returns the hostname
  703  #
  704  # ---------------------------------------------------------------------------------------------
  705  sub add_url
  706  {
  707      my ($self, $url, $encoded, $before, $after, $prefix, $noadd) = @_;
  708  
  709      my $temp_url = $url;
  710      my $temp_before;
  711      my $temp_after;
  712      my $hostform;   #ip or name
  713  
  714      # parts of a URL, from left to right
  715      my $protocol;
  716      my $authinfo;
  717      my $host;
  718      my $port;
  719      my $path;
  720      my $query;
  721      my $hash;
  722  
  723      # Strip the protocol part of a URL (e.g. http://)
  724  
  725      $protocol = $1 if ( $url =~ s/^([^:]*)\:\/\/// );
  726  
  727      # Remove any URL encoding (protocol may not be URL encoded)
  728  
  729      my $oldurl   = $url;
  730      my $percents =  ( $url =~ s/(%([0-9A-Fa-f]{2}))/chr(hex("0x$2"))/ge );
  731  
  732      if ( $percents > 0 ) {
  733          $self->update_pseudoword( 'html', 'encodedurl', $encoded, $oldurl ) if ( !defined( $noadd ) );
  734      }
  735  
  736      # Extract authorization information from the URL (e.g. http://foo@bar.com)
  737  
  738      $authinfo = $1 if ( $url =~ s/^(([[:alpha:]0-9\-_\.\;\:\&\=\+\$\,]+)(\@|\%40))+// );
  739  
  740      $self->update_pseudoword( 'html', 'authorization', $encoded, $oldurl ) if ( defined( $authinfo ) && ( $authinfo ne '' ) );
  741  
  742      if ( $url =~ s/^(([[:alpha:]0-9\-_]+\.)+)(com|edu|gov|int|mil|net|org|aero|biz|coop|info|museum|name|pro|[[:alpha:]]{2})([^[:alpha:]0-9\-_\.]|$)/$4/i ) {
  743          $host = "$1$3";
  744          $hostform = "name";
  745      } else {
  746          if ( $url =~ /(([^:\/])+)/ ) {
  747  
  748              # Some other hostname format found, maybe
  749              # Read here for reference: http://www.pc-help.org/obscure.htm
  750              # Go here for comparison: http://www.samspade.org/t/url
  751  
  752              # save the possible hostname
  753  
  754              my $host_candidate = $1;
  755  
  756              # stores discovered IP address
  757  
  758              my %quads;
  759  
  760              # temporary values
  761  
  762              my $quad = 1;
  763              my $number;
  764  
  765              # iterate through the possible hostname, build dotted quad format
  766  
  767              while ($host_candidate =~ s/\G^((0x)[0-9A-Fa-f]+|0[0-7]+|[0-9]+)(\.)?//) {
  768                  my $hex = $2;
  769  
  770                  # possible IP quad(s)
  771  
  772                  my $quad_candidate = $1;
  773                  my $more_dots      = $3;
  774  
  775                  if (defined $hex) {
  776  
  777                      # hex number
  778                      # trim arbitrary octets that are greater than most significant bit
  779  
  780                      $quad_candidate =~ s/.*(([0-9A-F][0-9A-F]){4})$/$1/i;
  781                      $number = hex( $quad_candidate );
  782                  } else {
  783                      if ( $quad_candidate =~ /^0([0-7]+)/ )  {
  784  
  785                          # octal number
  786  
  787                          $number = oct($1);
  788                      } else {
  789  
  790                          # assume decimal number
  791                          # deviates from the obscure.htm document here, no current browsers overflow
  792  
  793                          $number = int($quad_candidate);
  794                      }
  795                  }
  796  
  797                  # No more IP dots?
  798  
  799                  if ( !defined( $more_dots ) ) {
  800  
  801                      # Expand final decimal/octal/hex to extra quads
  802  
  803                      while ( $quad <= 4 ) {
  804                          my $shift = ((4 - $quad) * 8);
  805                          $quads{$quad} = ($number & (hex("0xFF") << $shift) ) >> $shift;
  806                          $quad += 1;
  807                      }
  808                  } else {
  809  
  810                      # Just plug the quad in, no overflow allowed
  811  
  812                      $quads{$quad} = $number if ($number < 256);
  813                      $quad += 1;
  814                  }
  815  
  816                  last if ( $quad > 4 );
  817              }
  818  
  819              $host_candidate =~ s/\r|\n|$//g;
  820              if ( ( $host_candidate eq '' ) && # PROFILE BLOCK START
  821                   defined( $quads{1} )      &&
  822                   defined( $quads{2} )      &&
  823                   defined( $quads{3} )      &&
  824                   defined( $quads{4} )      &&
  825                   !defined( $quads{5} ) ) {    # PROFILE BLOCK STOP
  826  
  827                  # we did actually find an IP address, and not some fake
  828  
  829                  $hostform = "ip";
  830                  $host = "$quads{1}.$quads{2}.$quads{3}.$quads{4}";
  831                  $url =~ s/(([^:\/])+)//;
  832              }
  833          }
  834      }
  835  
  836      if ( !defined( $host ) || ( $host eq '' ) ) {
  837          print "no hostname found: [$temp_url]\n" if ($self->{debug__});
  838          return '';
  839      }
  840  
  841      $port = $1 if ( $url =~ s/^\:(\d+)//);
  842      $path = $1 if ( $url =~ s/^([\\\/][^\#\?\n]*)($)?// );
  843      $query = $1 if ( $url =~ s/^[\?]([^\#\n]*|$)?// );
  844      $hash = $1 if ( $url =~ s/^[\#](.*)$// );
  845  
  846      if ( !defined( $protocol ) || ( $protocol =~ /^(http|https)$/ ) ) {
  847          $temp_before = $before;
  848          $temp_before = "\:\/\/" if (defined $protocol);
  849          $temp_before = "[\@]" if (defined $authinfo);
  850  
  851          $temp_after = $after;
  852          $temp_after = "[\#]" if (defined $hash);
  853          $temp_after = "[\?]" if (defined $query);
  854          $temp_after = "[\\\\\/]" if (defined $path);
  855          $temp_after = "[\:]" if (defined $port);
  856  
  857          update_word( $self, $host, $encoded, $temp_before, $temp_after, $prefix) if ( !defined( $noadd ) );
  858  
  859          # decided not to care about tld's beyond the verification performed when
  860          # grabbing $host
  861          # special subTLD's can just get their own classification weight (eg, .bc.ca)
  862          # http://www.0dns.org has a good reference of ccTLD's and their sub-tld's if desired
  863  
  864          if ( $hostform eq "name" ) {
  865              while ( $host =~ s/^([^\.])+\.(.*\.(.*))$/$2/ ) {
  866                  update_word( $self, $2, $encoded, '[\.]', '[<]', $prefix) if ( !defined( $noadd ) );
  867              }
  868          }
  869      }
  870  
  871      # $protocol $authinfo $host $port $query $hash may be processed below if desired
  872      return $host;
  873  }
  874  
  875  # ---------------------------------------------------------------------------------------------
  876  #
  877  # parse_html
  878  #
  879  # Parse a line that might contain HTML information, returns 1 if we are still inside an
  880  # unclosed HTML tag
  881  #
  882  # $line     A line of text
  883  # $encoded  1 if this HTML was found inside encoded (base64) text
  884  #
  885  # ---------------------------------------------------------------------------------------------
  886  sub parse_html
  887  {
  888      my ( $self, $line, $encoded ) = @_;
  889  
  890      my $found = 1;
  891  
  892      $line =~ s/[\r\n]+//gm;
  893  
  894      print "parse_html: [$line] " . $self->{in_html_tag__} . "\n" if $self->{debug__};
  895  
  896      # Remove HTML comments and other tags that begin !
  897  
  898      while ( $line =~ s/(<!.*?>)// ) {
  899          $self->update_pseudoword( 'html', 'comment', $encoded, $1 );
  900          print "$line\n" if $self->{debug__};
  901      }
  902  
  903      while ( $found && ( $line ne '' ) ) {
  904          $found = 0;
  905  
  906          # If we are in an HTML tag then look for the close of the tag, if we get it then
  907          # handle the tag, if we don't then keep building up the arguments of the tag
  908  
  909          if ( $self->{in_html_tag__} )  {
  910              if ( $line =~ s/^(.*?)>// ) {
  911                  $self->{html_arg__} .= $1;
  912                  $self->{in_html_tag__} = 0;
  913                  $self->{html_tag__} =~ s/=\n ?//g;
  914                  $self->{html_arg__} =~ s/=\n ?//g;
  915                  update_tag( $self, $self->{html_tag__}, $self->{html_arg__}, $self->{html_end}, $encoded );
  916                  $self->{html_tag__} = '';
  917                  $self->{html_arg__} = '';
  918                  $found = 1;
  919                  next;
  920              } else {
  921                  $self->{html_arg__} .= $line;
  922                  return 1;
  923              }
  924          }
  925  
  926          # Does the line start with a HTML tag that is closed (i.e. has both the < and the
  927          # > present)?  If so then handle that tag immediately and continue
  928  
  929          if ( $line =~ s/^<([\/]?)([A-Za-z]+)([^>]*?)>// )  {
  930              update_tag( $self, $2, $3, ( $1 eq '/' ), $encoded );
  931              $found = 1;
  932              next;
  933          }
  934  
  935          # Does the line consist of just a tag that has no closing > then set up the global
  936          # vars that record the tag and return 1 to indicate to the caller that we have an
  937          # unclosed tag
  938  
  939          if ( $line =~ /^<([\/]?)([^ >]+)([^>]*)$/ )  {
  940              $self->{html_end}    = ( $1 eq '/' );
  941              $self->{html_tag__}    = $2;
  942              $self->{html_arg__}    = $3;
  943              $self->{in_html_tag__} = 1;
  944              return 1;
  945          }
  946  
  947          # There could be something on the line that needs parsing (such as a word), if we reach here
  948          # then we are not in an unclosed tag and so we can grab everything from the start of the line
  949          # to the end or the first < and pass it to the line parser
  950  
  951          if ( $line =~ s/^([^<]+)(<|$)/$2/ ) {
  952              $found = 1;
  953              add_line( $self, $1, $encoded, '' );
  954          }
  955      }
  956  
  957      return 0;
  958  }
  959  
  960  # ---------------------------------------------------------------------------------------------
  961  #
  962  # parse_file
  963  #
  964  # Read messages from file and parse into a list of words and frequencies, returns a colorized
  965  # HTML version of message if color__ is set
  966  #
  967  # $file     The file to open and parse
  968  #
  969  # ---------------------------------------------------------------------------------------------
  970  sub parse_file
  971  {
  972      my ( $self, $file ) = @_;
  973  
  974      $self->start_parse();
  975  
  976      open MSG, "<$file";
  977      binmode MSG;
  978  
  979      # Read each line and find each "word" which we define as a sequence of alpha
  980      # characters
  981  
  982      while (<MSG>) {
  983          $self->parse_line( $_ );
  984      }
  985  
  986      close MSG;
  987  
  988      $self->stop_parse();
  989  
  990      if ( $self->{color__} )  {
  991          $self->{colorized__} .= $self->{ut__} if ( $self->{ut__} ne '' );
  992  
  993          $self->{colorized__} .= "</tt>";
  994          $self->{colorized__} =~ s/(\r\n\r\n|\r\r|\n\n)/__BREAK____BREAK__/g;
  995          $self->{colorized__} =~ s/[\r\n]+/__BREAK__/g;
  996          $self->{colorized__} =~ s/__BREAK__/<br \/>/g;
  997  
  998          return $self->{colorized__};
  999      } else {
 1000          return '';
 1001      }
 1002  }
 1003  
 1004  # ---------------------------------------------------------------------------------------------
 1005  #
 1006  # start_parse
 1007  #
 1008  # Called to reset internal variables before parsing.  This is automatically called when using
 1009  # the parse_file API, and must be called before the first call to parse_line.
 1010  #
 1011  # ---------------------------------------------------------------------------------------------
 1012  sub start_parse
 1013  {
 1014      my ( $self ) = @_;
 1015  
 1016      # This will contain the mime boundary information in a mime message
 1017  
 1018      $self->{mime__} = '';
 1019  
 1020      # Contains the encoding for the current block in a mime message
 1021  
 1022      $self->{encoding__} = '';
 1023  
 1024      # Variables to save header information to while parsing headers
 1025  
 1026      $self->{header__} = '';
 1027      $self->{argument__} = '';
 1028  
 1029      # Clear the word hash
 1030  
 1031      $self->{content_type__} = '';
 1032  
 1033      # Base64 attachments are loaded into this as we read them
 1034  
 1035      $self->{base64__}       = '';
 1036  
 1037      # Variable to note that the temporary colorized storage is "frozen",
 1038      # and what type of freeze it is (allows nesting of reasons to freeze
 1039      # colorization)
 1040  
 1041      $self->{in_html_tag__} = 0;
 1042  
 1043      $self->{html_tag__}    = '';
 1044      $self->{html_arg__}    = '';
 1045  
 1046      $self->{words__}        = {};
 1047      $self->{msg_total__}    = 0;
 1048      $self->{from__}         = '';
 1049      $self->{to__}           = '';
 1050      $self->{cc__}           = '';
 1051      $self->{subject__}      = '';
 1052      $self->{ut__}           = '';
 1053      $self->{quickmagnets__} = {};
 1054  
 1055      $self->{htmlbackcolor__} = map_color( $self, 'white' );
 1056      $self->{htmlfontcolor__} = map_color( $self, 'black' );
 1057      $self->compute_html_color_distance();
 1058  
 1059      $self->{in_headers__} = 1;
 1060  
 1061      $self->{first20__}      = '';
 1062      $self->{first20count__} = 0;
 1063  
 1064      # Used to return a colorize page
 1065  
 1066      $self->{colorized__} = '';
 1067      $self->{colorized__} .= "<tt>" if ( $self->{color__} );
 1068  }
 1069  
 1070  # ---------------------------------------------------------------------------------------------
 1071  #
 1072  # stop_parse
 1073  #
 1074  # Called at the end of a parse job.  Automatically called if parse_file is used, must be
 1075  # called after the last call to parse_line.
 1076  #
 1077  # ---------------------------------------------------------------------------------------------
 1078  sub stop_parse
 1079  {
 1080      my ( $self ) = @_;
 1081  
 1082      $self->{colorized__} .= $self->clear_out_base64();
 1083  
 1084      # If we reach here and discover that we think that we are in an unclosed HTML tag then there
 1085      # has probably been an error (such as a < in the text messing things up) and so we dump
 1086      # whatever is stored in the HTML tag out
 1087  
 1088      if ( $self->{in_html_tag__} ) {
 1089          $self->add_line( $self->{html_tag__} . ' ' . $self->{html_arg__}, 0, '' );
 1090      }
 1091  
 1092      $self->{in_html_tag__} = 0;
 1093  }
 1094  
 1095  # ---------------------------------------------------------------------------------------------
 1096  #
 1097  # parse_line
 1098  #
 1099  # Called to parse a single line from a message.  If using this API directly then be sure
 1100  # to call start_parse before the first call to parse_line.
 1101  #
 1102  # $line               Line of file to parse
 1103  #
 1104  # ---------------------------------------------------------------------------------------------
 1105  sub parse_line
 1106  {
 1107      my ( $self, $read ) = @_;
 1108  
 1109      if ( $read ne '' ) {
 1110  
 1111          # For the Mac we do further splitting of the line at the CR characters
 1112  
 1113          while ( $read =~ s/(.*?)[\r\n]+// )  {
 1114              my $line = "$1\r\n";
 1115  
 1116              next if ( !defined($line) );
 1117  
 1118              print ">>> $line" if $self->{debug__};
 1119  
 1120              if ($self->{color__}) {
 1121  
 1122                  if (!$self->{in_html_tag__}) {
 1123                      $self->{colorized__} .= $self->{ut__};
 1124                      $self->{ut__} = '';
 1125                  }
 1126  
 1127                  $self->{ut__} .= splitline($line, $self->{encoding__});
 1128              }
 1129  
 1130              if ($self->{in_headers__}) {
 1131  
 1132                  # temporary colorization while in headers is handled within parse_header
 1133  
 1134                  $self->{ut__} = '';
 1135  
 1136                  # Check for blank line signifying end of headers
 1137  
 1138                  if ( $line =~ /^(\r\n|\r|\n)/) {
 1139  
 1140                       # Parse the last header
 1141                      ($self->{mime__},$self->{encoding__}) = $self->parse_header($self->{header__},$self->{argument__},$self->{mime__},$self->{encoding__});
 1142  
 1143                      # Clear the saved headers
 1144                      $self->{header__}   = '';
 1145                      $self->{argument__} = '';
 1146  
 1147                      $self->{ut__} .= splitline( "\015\012", 0 );
 1148  
 1149                      $self->{in_headers__} = 0;
 1150                      print "Header parsing complete.\n" if $self->{debug__};
 1151  
 1152                      next;
 1153                  }
 1154  
 1155  
 1156                  # If we have an email header then just keep the part after the :
 1157  
 1158                  if ( $line =~ /^([A-Za-z-]+):[ \t]*([^\n\r]*)/ )  {
 1159  
 1160                      # Parse the last header
 1161  
 1162                      ($self->{mime__},$self->{encoding__}) = $self->parse_header($self->{header__},$self->{argument__},$self->{mime__},$self->{encoding__}) if ($self->{header__} ne '');
 1163  
 1164                      # Save the new information for the current header
 1165  
 1166                      $self->{header__}   = $1;
 1167                      $self->{argument__} = $2;
 1168                      next;
 1169                  }
 1170  
 1171                  # Append to argument if the next line begins with whitespace (isn't a new header)
 1172  
 1173                  if ( $line =~ /^([\t ].*?)(\r\n|\r|\n)/ ) {
 1174                      $self->{argument__} .= "\015\012" . $1;
 1175                  }
 1176                  next;
 1177              }
 1178  
 1179              # If we are in a mime document then spot the boundaries
 1180  
 1181              if ( ( $self->{mime__} ne '' ) && ( $line =~ /^\-\-($self->{mime__})(\-\-)?/ ) ) {
 1182  
 1183                  # approach each mime part with fresh eyes
 1184  
 1185                  $self->{encoding__} = '';
 1186  
 1187                  if (!defined $2) {
 1188                      print "Hit MIME boundary --$1\n" if $self->{debug__};
 1189                      $self->{in_headers__} = 1;
 1190                  } else {
 1191  
 1192                      $self->{in_headers__} = 0;
 1193  
 1194                      my $boundary = $1;
 1195  
 1196                      print "Hit MIME boundary terminator --$1--\n" if $self->{debug__};
 1197  
 1198                      # escape to match escaped boundary characters
 1199  
 1200                      $boundary =~ s/(.*)/\Q$1\E/g;
 1201  
 1202                      my $temp_mime;
 1203  
 1204                      foreach my $aboundary (split(/\|/,$self->{mime__})) {
 1205                          if ($boundary ne $aboundary) {
 1206                              if (defined $temp_mime) {
 1207                                  $temp_mime = join('|', $temp_mime, $aboundary);
 1208                              } else {
 1209                                  $temp_mime = $aboundary
 1210                              }
 1211                          }
 1212                      }
 1213  
 1214                      $self->{mime__} = ($temp_mime || '');
 1215  
 1216                      print "MIME boundary list now $self->{mime__}\n" if $self->{debug__};
 1217                      $self->{in_headers__} = 0;
 1218                  }
 1219  
 1220                  next;
 1221              }
 1222  
 1223              # If we are still in the headers then make sure that we are on a line with whitespace
 1224              # at the start
 1225  
 1226              if ( $self->{in_headers__} ) {
 1227                  if ( $line =~ /^[ \t\r\n]/ ) {
 1228                      next;
 1229                  }
 1230              }
 1231  
 1232              # If we are doing base64 decoding then look for suitable lines and remove them
 1233              # for decoding
 1234  
 1235              if ( $self->{encoding__} =~ /base64/i ) {
 1236                  $line =~ s/[\r\n]//g;
 1237                  $line =~ s/!$//;
 1238                  $self->{base64__} .= $line;
 1239  
 1240                  next;
 1241              }
 1242  
 1243              next if ( !defined($line) );
 1244  
 1245              # Look for =?foo? syntax that identifies a charset
 1246  
 1247              if ( $line =~ /=\?([^ ]{1,40})\?/ ) {
 1248                  update_word( $self, $1, 0, '', '', 'charset' );
 1249              }
 1250  
 1251              # Decode quoted-printable
 1252  
 1253              if ( $self->{encoding__} =~ /quoted\-printable/i ) {
 1254                  $line       = decode_qp( $line );
 1255                  $line       =~ s/[\r\n]+$//g;
 1256                  $self->{ut__} = decode_qp( $self->{ut__} ) if ( $self->{color__} );
 1257              }
 1258  
 1259              parse_html( $self, $line, 0 );
 1260          }
 1261      }
 1262  }
 1263  
 1264  # ---------------------------------------------------------------------------------------------
 1265  #
 1266  # clear_out_base64
 1267  #
 1268  # If there's anything in the {base64__} then decode it and parse it, returns colorization
 1269  # information to be added to the colorized output
 1270  #
 1271  # ---------------------------------------------------------------------------------------------
 1272  sub clear_out_base64
 1273  {
 1274      my ( $self ) = @_;
 1275  
 1276      my $colorized = '';
 1277  
 1278      if ( $self->{base64__} ne '' ) {
 1279          my $decoded = '';
 1280  
 1281          $self->{ut__}     = '' if $self->{color__};
 1282          $self->{base64__} =~ s/ //g;
 1283  
 1284          print "Base64 data: " . $self->{base64__} . "\n" if ($self->{debug__});
 1285  
 1286          $decoded = decode_base64( $self->{base64__} );
 1287          $self->parse_html( $decoded, 1 );
 1288  
 1289          print "Decoded: " . $decoded . "\n" if ($self->{debug__});
 1290  
 1291          $self->{ut__} = "<b>Found in encoded data:</b> " . $self->{ut__} if ( $self->{color__} );
 1292  
 1293              if ( $self->{color__} )  {
 1294                  if ( $self->{ut__} ne '' )  {
 1295                      $colorized = $self->{ut__};
 1296                      $self->{ut__} = '';
 1297              }
 1298          }
 1299      }
 1300  
 1301      $self->{base64__} = '';
 1302  
 1303      return $colorized;
 1304  }
 1305  
 1306  # ---------------------------------------------------------------------------------------------
 1307  #
 1308  # decode_string - Decode MIME encoded strings used in the header lines in email messages
 1309  #
 1310  # $mystring     - The string that neeeds decode
 1311  #
 1312  # Return the decoded string, this routine recognizes lines of the form
 1313  #
 1314  # =?charset?[BQ]?text?=
 1315  #
 1316  # A B indicates base64 encoding, a Q indicates quoted printable encoding
 1317  # ---------------------------------------------------------------------------------------------
 1318  sub decode_string
 1319  {
 1320      # I choose not to use "$mystring = MIME::Base64::decode( $1 );" because some spam mails
 1321      # have subjects like: "Subject: adjpwpekm =?ISO-8859-1?Q?=B2=E1=A4=D1=AB=C7?= dopdalnfjpw".
 1322      # Therefore, it will be better to store the decoded text in a temporary variable and substitute
 1323      # the original string with it later. Thus, this subroutine returns the real decoded result.
 1324  
 1325      my ( $self, $mystring ) = @_;
 1326  
 1327      my $decode_it = '';
 1328  
 1329      while ( $mystring =~ /=\?[\w-]+\?(B|Q)\?(.*?)\?=/ig ) {
 1330          if ($1 eq "B") {
 1331              $decode_it = decode_base64( $2 );
 1332              $mystring =~ s/=\?[\w-]+\?B\?(.*?)\?=/$decode_it/i;
 1333          } else {
 1334              if ($1 eq "Q") {
 1335                  $decode_it = $2;
 1336                  $decode_it =~ s/\_/=20/g;
 1337                  $decode_it = decode_qp( $decode_it );
 1338                  $mystring =~ s/=\?[\w-]+\?Q\?(.*?)\?=/$decode_it/i;
 1339              }
 1340          }
 1341      }
 1342  
 1343      return $mystring;
 1344  }
 1345  
 1346  # ---------------------------------------------------------------------------------------------
 1347  #
 1348  # get_header - Returns the value of the from, to, subject or cc header
 1349  #
 1350  # $header      Name of header to return (note must be lowercase)
 1351  #
 1352  # ---------------------------------------------------------------------------------------------
 1353  sub get_header
 1354  {
 1355      my ( $self, $header ) = @_;
 1356  
 1357      return $self->{$header . '__'};
 1358  }
 1359  
 1360  
 1361  # ---------------------------------------------------------------------------------------------
 1362  #
 1363  # parse_header - Performs parsing operations on a message header
 1364  #
 1365  # $header       Name of header being processed
 1366  # $argument     Value of header being processed
 1367  # $mime         The presently saved mime boundaries list
 1368  # $encoding     Current message encoding
 1369  #
 1370  # ---------------------------------------------------------------------------------------------
 1371  sub parse_header
 1372  {
 1373      my ($self, $header, $argument, $mime, $encoding) = @_;
 1374  
 1375      print "Header ($header) ($argument)\n" if ($self->{debug__});
 1376  
 1377      if ( $self->{color__} ) {
 1378          my $color     = $self->{bayes__}->get_color( "header:$header" );
 1379  
 1380          my $fix_argument = $argument;
 1381          $fix_argument =~ s/</&lt;/g;
 1382          $fix_argument =~ s/>/&gt;/g;
 1383  
 1384          $self->{ut__} =  "<b><font color=\"$color\">$header</font></b>: $fix_argument\015\012";
 1385      }
 1386  
 1387      # After a discussion with Tim Peters and some looking at emails
 1388      # I'd received I discovered that the header names (case sensitive) are
 1389      # very significant in identifying different types of mail, for example
 1390      # much spam uses MIME-Version, MiME-Version and Mime-Version
 1391  
 1392      $self->update_pseudoword( 'header', $header, 0, $header );
 1393  
 1394      # Check the encoding type in all RFC 2047 encoded headers
 1395  
 1396      if ( $argument =~ /=\?([^ ]{1,40})\?(Q|B)/i ) {
 1397              update_word( $self, $1, 0, '', '', 'charset' );
 1398      }
 1399  
 1400      # Handle the From, To and Cc headers and extract email addresses
 1401      # from them and treat them as words
 1402  
 1403      # For certain headers we are going to mark them specially in the corpus
 1404      # by tagging them with where they were found to help the classifier
 1405      # do a better job.  So if you have
 1406      #
 1407      # From: foo@bar.com
 1408      #
 1409      # then we'll add from:foo@bar.com to the corpus and not just foo@bar.com
 1410  
 1411      my $prefix = '';
 1412  
 1413      if ( $header =~ /^(From|To|Cc|Reply\-To)$/i ) {
 1414  
 1415          # These headers at least can be decoded
 1416  
 1417          $argument = $self->decode_string( $argument );
 1418  
 1419          if ( $argument =~ /=\?([^ ]{1,40})\?/ ) {
 1420              update_word( $self, $1, 0, '', '', 'charset' );
 1421          }
 1422  
 1423          if ( $header =~ /^From$/i )  {
 1424              $self->{from__} = $argument if ( $self->{from__} eq '' ) ;
 1425              $prefix = 'from';
 1426              push @{$self->{quickmagnets__}{$prefix}}, $argument if ($argument ne '');
 1427          }
 1428  
 1429          if ( $header =~ /^To$/i ) {
 1430              $prefix = 'to';
 1431              $self->{to__} = $argument if ( $self->{to__} eq '' );
 1432          }
 1433  
 1434          if ( $header =~ /^Cc$/i ) {
 1435              $prefix = 'cc';
 1436              $self->{cc__} = $argument if ( $self->{cc__} eq '' );
 1437          }
 1438  
 1439          while ( $argument =~ s/<([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+?))>// )  {
 1440              update_word($self, $1, 0, ';', '&',$prefix);
 1441              add_url($self, $2, 0, '@', '[&<]',$prefix);
 1442          }
 1443  
 1444          while ( $argument =~ s/([[:alpha:]0-9\-_\.]+?@([[:alpha:]0-9\-_\.]+))// )  {
 1445              update_word($self, $1, 0, '', '',$prefix);
 1446              add_url($self, $2, 0, '@', '',$prefix);
 1447          }
 1448  
 1449          add_line( $self, $argument, 0, $prefix );
 1450          return ($mime, $encoding);
 1451      }
 1452  
 1453      if ( $header =~ /^Subject$/i ) {
 1454          $prefix = 'subject';
 1455          $argument = $self->decode_string( $argument );
 1456          $self->{subject__} = $argument if ( ( $self->{subject__} eq '' ) );
 1457      }
 1458  
 1459      $self->{date__} = $argument if ( $header =~ /^Date$/i );
 1460  
 1461      # Look for MIME
 1462  
 1463      if ( $header =~ /^Content-Type$/i ) {
 1464          if ( $argument =~ /charset=\"?([^\" ]{1,40})\"?/ ) {
 1465              update_word( $self, $1, 0, '' , '', 'charset' );
 1466          }
 1467  
 1468          if ( $argument =~ /^(.*?)(;)/ ) {
 1469              print "Set content type to $1\n" if $self->{debug__};
 1470              $self->{content_type__} = $1;
 1471          }
 1472  
 1473          if ( $argument =~ /multipart\//i ) {
 1474              my $boundary = $argument;
 1475  
 1476              if ( $boundary =~ /boundary= ?(\"([A-Z0-9\'\(\)\+\_\,\-\.\/\:\=\?][A-Z0-9\'\(\)\+_,\-\.\/:=\? ]{0,69})\"|([^\(\)\<\>\@\,\;\:\\\"\/\[\]\?\=]{1,70}))/i ) {
 1477  
 1478                  $boundary = ($2 || $3);
 1479  
 1480                  $boundary =~ s/(.*)/\Q$1\E/g;
 1481  
 1482                  if ($mime ne '') {
 1483  
 1484                      # Fortunately the pipe character isn't a valid mime boundary character!
 1485  
 1486                      $mime = join('|', $mime, $boundary);
 1487                  } else {
 1488                      $mime = $boundary;
 1489                  }
 1490                  print "Set mime boundary to " . $mime . "\n" if $self->{debug__};
 1491                  return ($mime, $encoding);
 1492              }
 1493          }
 1494          return ($mime, $encoding);
 1495      }
 1496  
 1497      # Look for the different encodings in a MIME document, when we hit base64 we will
 1498      # do a special parse here since words might be broken across the boundaries
 1499  
 1500      if ( $header =~ /^Content-Transfer-Encoding$/i ) {
 1501          $encoding = $argument;
 1502          print "Setting encoding to $encoding\n" if $self->{debug__};
 1503          my $compact_encoding = $encoding;
 1504          $compact_encoding =~ s/[^A-Za-z0-9]//g;
 1505          $self->update_pseudoword( 'encoding', $compact_encoding, 0, $encoding );
 1506          return ($mime, $encoding);
 1507      }
 1508  
 1509      # Some headers to discard
 1510  
 1511      return ($mime, $encoding) if ( $header =~ /^(Thread-Index|X-UIDL|Message-ID|X-Text-Classification|X-Mime-Key)$/i );
 1512  
 1513      # Some headers should never be RFC 2047 decoded
 1514  
 1515      $argument = $self->decode_string($argument) unless ($header =~ /^(Received|Content\-Type|Content\-Disposition)$/i);
 1516  
 1517      add_line( $self, $argument, 0, $prefix );
 1518  
 1519      return ($mime, $encoding);
 1520  }
 1521  
 1522  
 1523  # ---------------------------------------------------------------------------------------------
 1524  #
 1525  # splitline - Escapes characters so a line will print as plain-text within a HTML document.
 1526  #
 1527  # $line         The line to escape
 1528  # $encoding     The value of any current encoding scheme
 1529  #
 1530  # ---------------------------------------------------------------------------------------------
 1531  
 1532  sub splitline
 1533  {
 1534      my ($line, $encoding) = @_;
 1535      $line =~ s/([^\r\n]{100,120} )/$1\r\n/g;
 1536      $line =~ s/([^ \r\n]{120})/$1\r\n/g;
 1537  
 1538      $line =~ s/</&lt;/g;
 1539      $line =~ s/>/&gt;/g;
 1540  
 1541      if ( $encoding =~ /quoted\-printable/i ) {
 1542          $line =~ s/=3C/&lt;/g;
 1543          $line =~ s/=3E/&gt;/g;
 1544      }
 1545  
 1546      $line =~ s/\t/&nbsp;&nbsp;&nbsp;&nbsp;/g;
 1547  
 1548      return $line;
 1549  }
 1550  
 1551  # GETTERS/SETTERS
 1552  
 1553  sub first20
 1554  {
 1555     my ( $self ) = @_;
 1556  
 1557     return $self->{first20__};
 1558  }
 1559  
 1560  sub quickmagnets
 1561  {
 1562     my ( $self ) = @_;
 1563  
 1564     return $self->{quickmagnets__};
 1565  }
 1566  
 1567  1;
 1568  
 1569