#!/usr/bin/perl
# -*- fill-column: 78 -*-

# tag2upload-oracled -- tag2upload simple Oracle protocol communicator

# Copyright (C) 2024-2025  Sean Whitton
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

# usage:
#   tag2upload-oracled [--workers=WORKERS] [--no-restart-workers]	\
#                      [--ssh=SSH]					\
#                      [--autopkgtest-virt=autopkgtest-virt-SERVER]	\
#                      [--autopkgtest-arg=VIRT-SERVER-ARG] ...		\
#                      [--retain-tmp]					\
#                      --manager=MNGR-HOST --manager-socket=MNGR-SOCK	\
#                      --builder=BLDR-HOST				\
#                      --noreply=NOREPLY --copies=COPIES		\
#                      [--] DISTRO DISTRO-DIR AUTH-SPEC [<settings>]

use 5.028;
use warnings;
use POSIX qw(:signal_h strftime WNOHANG);
use IPC::Open2;
use URI::Escape;
use Getopt::Long;

use Debian::Dgit::Infra;	# must precede Debian::Dgit
use Debian::Dgit qw(!fail);
use Debian::Dgit::ProtoConn;

our ($workers_n, $restart_workers, $ssh, $adt_virt)
  = (1, 1, "ssh", "autopkgtest-virt-null");
our ($retain_tmp, $manager, $socket, $builder, $noreply, $copies, @adt_args);

GetOptions
  # Optional arguments.
  "workers=i"			=> \$workers_n,
  "ssh=s"			=> \$ssh,
  "autopkgtest-virt|adt-virt=s" => \$adt_virt,
  "retain-tmp"                  => \$retain_tmp,
  "autopkgtest-arg=s"		=> \@adt_args,
  "restart-workers!"		=> \$restart_workers,

  # Required arguments.
  "manager=s"			=> \$manager,
  "manager-socket=s"		=> \$socket,
  "builder=s"			=> \$builder,
  "noreply=s"			=> \$noreply,
  "copies=s"			=> \$copies;
$manager && $socket && $builder && $noreply && $copies
  or die "not enough arguments";

@ARGV >= 3 or die "not enough arguments for dgit-repos-server";
our @drs_args = @ARGV;

our @fatal_signals = qw(HUP TERM INT QUIT);
our $sigset
  = POSIX::SigSet->new(map { no strict; &{"SIG$_"} } @fatal_signals);

# We are expecting to be on a LAN with the Manager & Builder, so be fairly
# intolerant of connection issues.
our @ssh_opts = qw( -oBatchMode=yes -oConnectTimeout=30
		    -oServerAliveInterval=120 -oServerAliveCountMax=8 );

sub me { "$builder,$$" }
sub say_log {
    # We just output to STDERR for now.  Either systemd will pick it up for
    # its journal, or we will use some kind of remote syslogging.
    # Either way we will want to be able to inspect the live logs even though
    # we won't have shell access to the host running this daemon.
    #
    # We do only whole lines at once given these eventual expected outputs.
    printf STDERR "[t2u-oracled %s][%s] %s\n",
      me, strftime("%FT%T", gmtime), $_
      for @_
}

sub fail {
    my $msg = shift;
    say_log "error: $msg";
    die $msg;
}

# Main procedure.
{
    my %worker_pids;

    foreach my $sig (@fatal_signals) {
	$SIG{$sig} = sub {
	    say_log "group leader: received SIG$sig; shutting down workers";
	    local $SIG{CHLD} = "IGNORE";
	    kill $sig => keys %worker_pids;
	    exit 0;
	};
    }

    my $start_worker = sub {
	# Block signals so that we don't call our (parent-appropriate) signal
	# handler in the child right after fork, before the child has had a
	# chance to reset %SIG.
	sigprocmask(SIG_BLOCK, $sigset) or fail $!;
	if (my $child = fork // fail $!) {
	    sigprocmask(SIG_UNBLOCK, $sigset) or fail $!;
	    $worker_pids{$child}++;
	} else {
	    $SIG{$_} = "DEFAULT" for @fatal_signals;
	    sigprocmask(SIG_UNBLOCK, $sigset) or fail $!;
	    # Jump out of the parent process's lexical scope.
	    worker();
	    # worker() should never return, but ensure no grandchild workers.
	    exit 255;
	}
    };
    $start_worker->() for 1..$workers_n;

    for (;;) {
	# Now we do nothing until after at least one worker dies, then wait
	# for a bit longer before going round again to start up a replacement.
	# We start up one replacement at a time.
	#
	# If the worker died then it's probably because either the SSH
	# connection failed, or there was a bug triggered by the particular
	# manager request the worker was trying to handle.  In both cases it
	# is fine to restart workers: in the latter case, it's okay because no
	# state is shared between workers, and the manager shouldn't send the
	# bug-triggering request again immediately.
	#
	# In both cases, though, we want a delay.  In the second case this is
	# to prevent us getting stuck in a pointless tight forking loop if
	# workers are dying over and over again in quick succession.
	my $child = wait;
	if ($child == -1) {
	    fail "No workers to reap -- shouldn't be possible";
	} elsif (!$worker_pids{$child}) {
	    say_log "wait(2) returned unexpected PID $child";
	} else {
	    say_log "worker $child: ". waitstatusmsg;
	    delete $worker_pids{$child};
	    # This could become more sophisticated (e.g. exponential backoff)
	    # if necessary, but hopefully things will be reliable enough.
	    # We set this SIGALRM handler for the benefit of the test suite.
	    fail "group leader: restarting workers disabled"
	      unless $restart_workers;
	    $SIG{ALRM} = sub { undef };
	    sleep 20;
	    $SIG{ALRM} = "DEFAULT";
	    $start_worker->();
	}
    }
}

sub worker {
    my $mngr = Debian::Dgit::ProtoConn->open2(
	$ssh, @ssh_opts, $manager, qw(nc.openbsd -U), qq("$socket"));
    $mngr->set_description('manager');
    $mngr->set_fail_hook(
	sub {
	    my $msg = shift;
            (waitpid $mngr->get_pid(), WNOHANG) == 0
		or say_log "ssh to manager: ".waitstatusmsg;
	    eval { $mngr->send("protocol-violation $msg") };
	    say_log sprintf "%s to inform manager: %s",
	      ($@ ? "failed" : "attempted"), $msg;
	});
    $mngr->expect(sub { /^t2u-manager-ready$/ });
    $mngr->send("t2u-oracle-version 2");
    $mngr->send(sprintf "worker-id %s", me);
    for (;;) {
	my ($msg, $payld_id, $payld_url) = $mngr->expect(sub {
	    /^(?|(ayt)|(job) ([[:alnum:]][[:alnum:],-.]*) ([[:graph:]]+))$/a
	});
	if ($msg eq "ayt") {
	    $mngr->send("ack");
	} elsif ($msg eq "job") {
	    sigprocmask(SIG_BLOCK, $sigset) or fail $!;
	    handle_job($mngr, $payld_id, $payld_url);
	    sigprocmask(SIG_UNBLOCK, $sigset) or fail $!;
	} else {
	    fail "ProtoConn's expect() has failed us";
	}
    }
}

sub handle_job {
    my ($mngr, $id, $url) = @_;
    my $tag;

    $mngr->receive_data_blocks(sub {
	if ($tag) {
	    # There should not be a second data-block.
	    $mngr->bad("unexpected data-block");
	} else {
	    $tag = shift;
	}
    });

    # Parse it just enough to log something useful.
    # Leave the real parsing, and emailing, to dgit-repos-server.
    my ($tag_name) = $tag =~ /^tag (\S+)$/m or fail "couldn't find tag name";
    say_log "starting job $id, tag $tag_name for $url";

    # Use autopkgtest's virtualisation server protocol so that we can easily
    # upgrade the isolation.
    # Spec.: /usr/share/doc/autopkgtest/README.virtualisation-server.rst.gz
    #
    # The protocol requires that we ensure here, in this call to
    # Debian::Dgit::ProtoConn::open2, that the way we invoke the
    # virtualisation server will ensure that we have exclusive use of the
    # testbed.
    my $virt = Debian::Dgit::ProtoConn->open2(
	$ssh, @ssh_opts, $builder, $adt_virt, @adt_args);

    $virt->set_description('virt');
    $virt->set_fail_hook(sub {
        (waitpid $virt->get_pid(), WNOHANG) == 0
	    or say_log "virt server: ".waitstatusmsg;
    });

    $virt->expect(sub { /^ok$/ });
    $virt->send("open");
    my ($virt_dir) = $virt->expect(sub { /^ok (.+)$/ });
    $virt->send("print-execute-command");
    my ($virt_cmd_enclist) = $virt->expect(sub { /^ok (\S+)/ });

    # The diversion of the code path into dgit-repos-server now is for
    # historical reasons.  While invoking 'dgit rpush-source' is essential to
    # the design, the parts of dgit-repos-server we use could be refactored
    # and moved here.
    my @drs
      = ($ENV{DGIT_REPOS_SERVER_TEST} // qw(dgit-repos-server), @drs_args,
	 qw(--tag2upload5), $ssh, $builder, $virt_dir, $virt_cmd_enclist,
	 $noreply, $copies, qw(--), $id, $url, $tag_name);
    say_log "invoking <<@drs>>";
    my $drs_child = open2(my $drs_out, my $drs_in, @drs)
      // die "failed to start dgit-repos-server: $!";
    print $drs_in $tag;
    close $drs_in;

    # dgit-repos-server generates the remainder of the protocol messages, but
    # as this program is ultimately responsible for speaking the simple Oracle
    # protocol, we validate.
    chomp(my $drs_msg = <$drs_out>)    =~ /^message /;
    chomp(my $drs_status = <$drs_out>) =~ /^(?:irrecoverable|uploaded)$/;
    <$drs_out> and die "dgit-repos-server sent too much output";
    $drs_out->error and die $!;
    say_log
      "job $id, tag $tag_name: ".substr($drs_msg, 8),
      "job $id, tag $tag_name final disposition: $drs_status";
    $mngr->send($_) for $drs_msg, $drs_status;

    (waitpid $drs_child, 0) == $drs_child or die $!;

    die sprintf "dgit-repos-server %s", waitstatusmsg() if $?;

    unless ($retain_tmp) {
      # Most virtualisation backends will take care of this, but it's not
      # guaranteed by the protocol.
      my @virt_cmd = map uri_unescape($_), split /,/, $virt_cmd_enclist;
      system $ssh, @ssh_opts, $builder, map shellquote($_),
	@virt_cmd, qw(rm -rf), $virt_dir;
      $? == 0 or warn "WARNING: failed to remove $virt_dir in builder virt";
    }

    $virt->send("quit");

    # Spec says we should expect `ok` but at least adt-virt-null
    # doesn't send it.  #1092808.  Anyway, we can safely waitpid without
    # risk of deadlock - the pipe would fit an ok if it sent one.
    (waitpid $virt->get_pid(), 0) == $virt->get_pid() or die $!;
    die sprintf "autopkgtest virt server: %s", waitstatusmsg() if $?;
}
