#!/usr/bin/perl

use strict;
use warnings;

use File::Temp ();
use LWP::Simple;
use URI::Escape;
use Data::Dumper;

my $FROM_CACHE; if (@ARGV && $ARGV[0] eq '--fromcache') { $FROM_CACHE = 1; }
my $MAKE_CACHE; $MAKE_CACHE = 1;        # turn this on by default, no harm

# we allow promotion of rules that are "ifplugin" one of these
my @def_plugins = map {
    s,^lib/Mail/SpamAssassin/Plugin/(\S+)\.pm$,Mail::SpamAssassin::Plugin::$1,gs;
    $_;
  } <lib/Mail/SpamAssassin/Plugin/*.pm>;

my $PROMOTABLE_PLUGINS_RE = "^" . join("|", @def_plugins) . "\$";

# number of days to look back; if a rule isn't listed as promotable on
# all of these days, it won't be listed.  (we grant an exception for
# new rules that didn't exist on previous days, however, so new rules
# can be published quickly to handle sudden outbreaks without requiring
# manual update work)
my @DAYS_REQUIRED = (1, 2, 3, 4, 5);

###########################################################################

my $cgi_url = "https://ruleqa.spamassassin.org/";
my @doc = ();
my $cache = 'ruleqa.cache.';
my $submitters = '';
my $last_net;
my %outputs;

if (!$FROM_CACHE || !-f "${cache}net" || (-M "${cache}net") > 0.5) {
  my $neturl = $cgi_url."last-net?xml=1";
  warn "HTTP get: $neturl\n";
  $last_net = get ($neturl);
  if (!$last_net) {
    die "HTTP get failed: last-net\n";
  }
  if ($MAKE_CACHE) {
    open(O, ">${cache}net"); print O $last_net; close O;
  }
} else {
  open(I, "<${cache}net") or die; $last_net = join('',<I>); close I;
}

if ($last_net =~ m{
          <span\s+class="daterev_masscheck_description\smcviewing"
          .{0,400}
          <span\s+class="mcsubmitters">\s*(.*?)\s*</span>
        }sx)
  {

  my $netsubs = $1;
  ($submitters ne '') and $submitters .= "; ";
  $submitters .= "last-net: $netsubs";
} else {
  loghtml_die("no 'mcviewing', 'mcsubmitters' microformats for last-net");
}

my $netlist;
while ($last_net =~ m!<rule>(.*?)</rule>!xg) {
  my $xml = $1;
  my $obj = { };

  while ($xml =~ s!<([A-Za-z0-9_]+)>(.*?)</\1>!!) {
    $obj->{$1} = $2;
  }
  while ($xml =~ s!<([A-Za-z0-9_]+)\s+esc=["']1["']>(.*?)</\1>!!) {
    $obj->{$1} = uri_unescape($2);
  }

  my $name = $obj->{test};
  $obj->{detailhref} = $cgi_url.$obj->{detailhref};

  $netlist->{$name} = $obj;
}

if (!scalar keys %{$netlist}) {
  loghtml_die("no rules found? on last-net");
}

my $url;        # tracks the last day used
my $dayoffset = 0;
my $with_new_offset = 0;

foreach my $day (@DAYS_REQUIRED) {
  if (!$FROM_CACHE || !-f $cache.$day || (-M $cache.$day) > 0.5 || $with_new_offset) {

    $with_new_offset = 0;
    $url = $cgi_url.($day+$dayoffset)."-days-ago?xml=1";
    warn "HTTP get: $url\n";

    $doc[$day] = get ($url);
    if (!$doc[$day]) {
      die "HTTP get failed: $doc[$day]\n";
    }

    if ($MAKE_CACHE) {
      open(O, ">$cache$day"); print O $doc[$day]; close O;
    }
  }
  else {
    open(I, "<$cache$day") or die; $doc[$day] = join('',<I>); close I;
  }

###########################################################################

  # the HTML looks like:
  #
  #   <span class="daterev_masscheck_description" class="mcviewing">
  #   ...
  #   <em><span class="mcsubmitters"> ....... </span></em>
  #   ...
  #   </span>
  #
  # in other words, the machine-parseable metadata is embedded in the HTML
  # as a microformat.

  if ($doc[$day] =~ m{
          <span\s+class="daterev_masscheck_description\smcviewing"
          .{0,400}
          <span\s+class="mcsubmitters">\s*(.*?)\s*</span>
        }sx)
  {
    my $daysubs = $1;

    # ignore days when the mass-check sets contain a --net log, since
    # it's the weekly --net run.  That generally contains a much
    # smaller set of logs (since it takes longer to run mass-check --net)
    # so the results are untrustworthy.
    if ($daysubs =~ /(?:^|\s)net-/) {
      warn "day $day contains a --net mass-check! offsetting by an extra day\n";
      $dayoffset++;
      $with_new_offset = 1; redo;
    }

    ($submitters ne '') and $submitters .= "; ";
    $submitters .= "day $day: $daysubs";
  }
  else {
    loghtml_die("no 'mcviewing', 'mcsubmitters' microformats on day $day");
  }
}

###########################################################################

# <rule><test>__HIGHBITS</test><promo>0</promo>
# <spc>8.7654</spc><hpc>0.2056</hpc><so>0.977</so>
# <detailhref>ruleqa%3Fdaterev%3Dlast-night%26rule%3D__HIGHBITS%26s_detail%3D1</detailhref></rule>

my $plist;
my %ruleslist;
foreach my $day (@DAYS_REQUIRED) {
  while ($doc[$day] =~ m!<rule>(.*?)</rule>!xg) {
    my $xml = $1;
    my $obj = { };

    while ($xml =~ s!<([A-Za-z0-9_]+)>(.*?)</\1>!!) {
      $obj->{$1} = $2;
    }
    while ($xml =~ s!<([A-Za-z0-9_]+)\s+esc=["']1["']>(.*?)</\1>!!) {
      $obj->{$1} = uri_unescape($2);
    }

    my $name = $obj->{test};
    $ruleslist{$name} = 1;
    $obj->{detailhref} = $cgi_url.$obj->{detailhref};

    $plist->[$day]->{$name} = $obj;
  }

  if (!scalar keys %{$plist->[$day]}) {
    loghtml_die("no rules found? on day $day");
  }
}

sub median_array {
    my @vals = sort {$a <=> $b} @_;
    my $len = @vals;
    if($len%2) #odd?
    {
        return $vals[int($len/2)];
    }
    else #even
    {
        return ($vals[int($len/2)-1] + $vals[int($len/2)])/2;
    }
}

###########################################################################

## my $dump = Data::Dumper->Dump([$plist], ['promolist']); print $dump;

# use SpamAssassin classes directly, so we can lint rules
# as we go
use lib 'lib';
use Mail::SpamAssassin;

my $mailsa = Mail::SpamAssassin->new({
    rules_filename => "rules",
    site_rules_filename => join("\000", qw( rulesrc/core rulesrc/sandbox )),
    local_tests_only => 1,
    dont_copy_prefs => 1,
    config_tree_recurse => 1,
    keep_config_parsing_metadata => 1,
    # debug => 1,
});

# hack hack hack!!  we don't want to load plugin files twice,
# and since the mkrules compiler copies from rulesrc/sandbox/*/*.pm
# to rules/*.pm, they would otherwise appear twice.
foreach my $fname (<rules/*.pm>) {
  my $path = File::Spec->rel2abs($fname);
  $INC{$path} = 1;
  # warn "JMD $path";
}

my %rules_with_errors = ();

$mailsa->{lint_callback} = sub {
  my %opts = @_;

  # ignore non-rule-issue lint failures
  return if ($opts{msg} =~ /(?:
        score\sset\sfor\snon-existent|
        description\sexists
    )/x);

  warn "demoting $opts{rule}: $opts{msg}";
  if ($opts{iserror}) {
    $rules_with_errors{$opts{rule}}++;
  }
};

$mailsa->lint_rules();

print "# DO NOT EDIT: file generated by build/mkupdates/listpromotable\n";
print "# active ruleset list, automatically generated from $cgi_url\n";
print "# with results from: $submitters\n";

my @spcs = ($submitters =~ /\s+/g);
if (scalar @spcs < 2) {
  die "not generating results; less than 3 submitter results available!\n";
}

foreach my $netrule (sort keys %{$netlist}) {
  my $name = $netrule;
  my $notes = '';

  next if ($name =~ /^__/);

  my $no_t = $name;
  if ($no_t =~ s/^T_//) {
    if (defined $mailsa->{conf}->{scores}->{$no_t}) {
      $name = $no_t;
    }
  }

  # now that it's ok to have sandbox rules without a T_ prefix,
  # "T_" prefix implies "tflags nopublish"
  next if ($name =~ /^T_/);

  # ignore rules that don't exist (if they have a desc or score,
  # they exist according to the Conf parser)
  next unless ($mailsa->{conf}->{descriptions}->{$name}
        || $mailsa->{conf}->{scores}->{$name});

  my $tfs = $mailsa->{conf}->{tflags}->{$name};

  # "nopublish" tflags
  if ($tfs) {
    next if ($tfs =~ /\bnopublish\b/);
  }

  next if ($mailsa->{conf}->{testrules}->{$name});

  if ($tfs && $tfs =~ /\b(net)\b/) {
    $notes = "tflags ".$1;
    goto publish;
  }

  next;
publish:

  $outputs{$name} = $notes unless defined $outputs{$name};
}

foreach my $name (keys %ruleslist) {
  next if $name =~ /^__/;

  my $plistobj = $plist->[1]->{$name};
  my $notes = '';

  # rules in sandboxes without a T_ prefix, will be renamed during the
  # ruleqa process... in other words, the output freqs line will talk
  # about rule "T_FOO".   if there's a rule "FOO" defined, assume that's
  # the one being talked about.
  my $no_t = $name;
  if ($no_t =~ s/^T_//) {
    if (defined $mailsa->{conf}->{scores}->{$no_t}) {
      $name = $no_t;
    }
  }

  # now that it's ok to have sandbox rules without a T_ prefix,
  # "T_" prefix implies "tflags nopublish"
  next if ($name =~ /^T_/);

  # ignore rules that don't exist (if they have a desc or score,
  # they exist according to the Conf parser)
  next unless ($mailsa->{conf}->{descriptions}->{$name}
        || $mailsa->{conf}->{scores}->{$name});

  # "nopublish" tflags
  my $tfs = $mailsa->{conf}->{tflags}->{$name};
  if ($tfs) {
    next if ($tfs =~ /\bnopublish\b/);

    if ($tfs =~ /\b(publish)\b/) {
      $notes = "tflags ".$1;
      goto publish;
    }
  }

  # rule was from a file marked with "#testrules" (bug 5545)
  # note: this is after "tflags publish" support, so you can override
  # it on a rule-by-rule basis anyway
  next if ($mailsa->{conf}->{testrules}->{$name});

  # bug 6560, unless specifically declared #testrules;
  # all of these tflags force publication;
  # include "net", since otherwise this script has to be aware
  # what day of the week it is for weekly net/non-net mass-checks!
  # very messy.  TODO?
  if ($tfs && $tfs =~ /\b(userconf|learn|net)\b/) {
    $notes = "tflags ".$1;
    goto publish;
  }

  # only rules from "rulesrc" dirs
  my $src = $mailsa->{conf}->{source_file}->{$name};
  next if (!$src || $src !~ /rulesrc/);

  # rules that fail lint
  next if $rules_with_errors{$name};

  # base active on DAYS_REQUIRED days of checks
  # Find median for promo
  my @promo_arr;
  foreach my $day (@DAYS_REQUIRED) {
    if (defined $plist->[$day]->{$name}) {
      push (@promo_arr, $plist->[$day]->{$name}{promo});
    }
    if (defined $plist->[$day]->{"T_$name"}) {
      push (@promo_arr, $plist->[$day]->{"T_$name"}{promo});
    }
  }

  my $is_promo = median_array(@promo_arr);

  next unless $is_promo gt 0;

  # that require a plugin we won't have
  my $skip = 0;
  my $ifs = $mailsa->{conf}->{if_stack}->{$name};
  while ($ifs && $ifs =~ /plugin\s*\((.+?)\)/g) {
    my $pkg = $1;
    # grep out the ones we *do* have, and do use in "ifplugin"
    # lines in "rulesrc", here...
    next;   #JMD:
    next if ($pkg =~ /${PROMOTABLE_PLUGINS_RE}/o);
    print "\n# not publishing $name: needs $ifs\n";
    $skip++;
  }
  next if $skip;

  # don't output the ever-changing bits of data
  # $notes = "spam=$plistobj->{spc} ham=$plistobj->{hpc} so=$plistobj->{so}";
  $notes = "good enough";

publish:

  $outputs{$name} = $notes unless defined $outputs{$name};
}

foreach my $key (sort(keys %outputs)) {
  print "\n# ", $outputs{$key}, "\n", $key, "\n";
}
exit;

sub loghtml_die {
  die "$_[0]\nURL: $url\n";
}
