#!/usr/bin/env perl

# ===========================================================================
#
#                            PUBLIC DOMAIN NOTICE
#            National Center for Biotechnology Information (NCBI)
#
#  This software/database is a "United States Government Work" under the
#  terms of the United States Copyright Act.  It was written as part of
#  the author's official duties as a United States Government employee and
#  thus cannot be copyrighted.  This software/database is freely available
#  to the public for use. The National Library of Medicine and the U.S.
#  Government do not place any restriction on its use or reproduction.
#  We would, however, appreciate having the NCBI and the author cited in
#  any work or product based on this material.
#
#  Although all reasonable efforts have been taken to ensure the accuracy
#  and reliability of the software and data, the NLM and the U.S.
#  Government do not and cannot warrant the performance or results that
#  may be obtained by using this software or data. The NLM and the U.S.
#  Government disclaim all warranties, express or implied, including
#  warranties of performance, merchantability or fitness for any particular
#  purpose.
#
# ===========================================================================
#
# File Name:  nquire
#
# Author:  Jonathan Kans
#
# Version Creation Date:   8/20/12
#
# ==========================================================================

# Entrez Direct - EDirect

# use strict;
use warnings;

my ($LibDir, $ScriptName);

use File::Spec;

# nquire version number

$version = "10.5";

BEGIN
{
  my $Volume;
  ($Volume, $LibDir, $ScriptName) = File::Spec->splitpath($0);
  $LibDir = File::Spec->catpath($Volume, $LibDir, '');
  if (my $RealPathname = eval {readlink $0}) {
    do {
      $RealPathname = File::Spec->rel2abs($RealPathname, $LibDir);
      ($Volume, $LibDir, undef) = File::Spec->splitpath($RealPathname);
      $LibDir = File::Spec->catpath($Volume, $LibDir, '')
    } while ($RealPathname = eval {readlink $RealPathname});
  } else {
    $LibDir = File::Spec->rel2abs($LibDir)
  }
  $LibDir .= '/aux/lib/perl5';
}
use lib $LibDir;

use LWP::UserAgent;
use POSIX;
use URI::Escape;
use Net::FTP;

# definitions

use constant false => 0;
use constant true  => 1;

# utility subroutines

sub clearflags {
  %macros = ();
  $agent = "Nquire/1.0";
  $alias = "";
  $debug = false;
  $http = "";
  $output = "";
}

sub map_macros {

  $qury = shift (@_);

  if ( $qury !~ /\(#/ ) {
    return $qury;
  }

  if ( scalar (keys %macros) > 0 ) {
    for ( keys %macros ) {
      $ky = $_;
      $vl = $macros{$_};
      $qury =~ s/\((\#$ky)\)/$vl/g;
    }
  }

  return $qury;
}

sub read_aliases {

  if ( $alias ne "" ) {
    if (open (my $PROXY_IN, $alias)) {
      while ( $thisline = <$PROXY_IN> ) {
        $thisline =~ s/\r//;
        $thisline =~ s/\n//;
        $thisline =~ s/ +/ /g;
        $thisline =~ s/> </></g;

        if ( $thisline =~ /(.+)\t(.+)/ ) {
          $ky = $1;
          $vl = $2;
          $vl =~ s/\"//g;
          $macros{"$ky"} = "$vl";
        }
      }
      close ($PROXY_IN);
    } else {
      print STDERR "Unable to open alias file '$alias'\n";
    }
  }
}

# send actual query

sub do_post {

  $urlx = shift (@_);
  $argx = shift (@_);

  $rslt = "";

  if ( $debug ) {
    if ( $argx ne "" ) {
      print STDERR "URL: $urlx?$argx\n\n";
    } else {
      print STDERR "URL: $urlx\n\n";
    }
  }

  if ( $http eq "get" or $http eq "GET" ) {
    if ( $argx ne "" ) {
      $urlx .= "?";
      $urlx .= "$argx";
    }

    $usragnt = new LWP::UserAgent (timeout => 300);
    $usragnt->agent( "$agent" );

    $res = $usragnt->get ( $urlx );

    if ( $res->is_success) {
      $rslt = $res->content;
    } elsif ( $debug ) {
      print STDERR "STATUS: " . $res->status_line . "\n";
    }

    if ( $rslt eq "" and $debug ) {
      print STDERR "No do_get output returned from '$urlx'\n";
    }

    if ( $debug ) {
      print STDERR "$rslt\n";
    }

    return $rslt;
  }

  $usragnt = new LWP::UserAgent (timeout => 300);
  $usragnt->agent( "$agent" );

  $req = new HTTP::Request POST => "$urlx";
  $req->content_type('application/x-www-form-urlencoded');
  $req->content("$argx");

  $res = $usragnt->request ( $req );

  if ( $res->is_success) {
    $rslt = $res->content;
  } elsif ( $debug ) {
    print STDERR "STATUS: " . $res->status_line . "\n";
  }

  if ( $rslt eq "" && $debug ) {
    if ( $argx ne "" ) {
      $urlx .= "?";
      $urlx .= "$argx";
    }
    print STDERR "No do_post output returned from '$urlx'\n";
  }

  if ( $debug ) {
    print STDERR "$rslt\n";
  }

  return $rslt;
}

# uri_escape with backslash exceptions

sub do_uri_escape {

  $patx = shift (@_);

  $rslt = "";

  while ( $patx ne "" ) {
    if ( $patx =~ /^\\\\(.+)/ ) {
      $rslt .= "\\";
      $patx = $1;
    } elsif ( $patx =~ /^\\(.)(.+)/ ) {
      $rslt .= $1;
      $patx = $2;
    } elsif ( $patx =~ /^(.)(.+)/ ) {
      $rslt .= uri_escape ($1);
      $patx = $2;
    } elsif ( $patx =~ /^(.)/ ) {
      $rslt .= uri_escape ($1);
      $patx = "";
    }
  }

  return $rslt;
}

# nquire executes an external URL query from command line arguments

my $nquire_help = qq{
Query Commands

  -ftp         Uses FTP instead of HTTP
  -get         Uses HTTP GET instead of POST
  -url         Base URL for external search

Documentation

  -help        Print this document
  -examples    Examples of advanced queries
  -version     Print version number

Examples

  nquire -get -url "http://collections.mnh.si.edu/services/resolver/resolver.php" \\
    -voucher "Birds:625456" |
  xtract -pattern Result -element ScientificName Country

  nquire -get -url http://w1.weather.gov/xml/current_obs/KSFO.xml |
  xtract -pattern current_observation -tab "\\n" \\
    -element weather temp_f wind_dir wind_mph

  nquire -url "https://eutils.ncbi.nlm.nih.gov/entrez/eutils" elink.fcgi \\
    -dbfrom protein -db protein -cmd neighbor -linkname protein_protein -id NP_476532.1

  nquire -eutils efetch.fcgi -db pubmed -id 2539356 -rettype medline -retmode text

  nquire -eutils esummary.fcgi -db pubmed -id 2539356 -version 2.0

  nquire -eutils esearch.fcgi -db pubmed -term "transposition immunity Tn3" |
  xtract -pattern eSearchResult -element QueryTranslation

  nquire -ftp ftp.ncbi.nlm.nih.gov pub/gdp ideogram_9606_GCF_000001305.14_850_V1 |
  grep acen | cut -f 1,2,6,7 | grep "^X\\t"

};

my $nquire_examples = qq{
Medical Subject Headings

  nquire -get -url "http://id.nlm.nih.gov/mesh/sparql" \\
    -query "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> \\
      SELECT DISTINCT ?class FROM <http://id.nlm.nih.gov/mesh> \\
      WHERE { ?s rdf:type ?class } ORDER BY ?class" |
  xtract -pattern result -pfx "meshv:" -first "uri[http://id.nlm.nih.gov/mesh/vocab#|]"

  meshv:AllowedDescriptorQualifierPair
  meshv:CheckTag
  meshv:Concept
  meshv:DisallowedDescriptorQualifierPair
  meshv:GeographicalDescriptor
  meshv:PublicationType
  meshv:Qualifier
  meshv:SCR_Chemical
  meshv:SCR_Disease
  meshv:SCR_Organism
  meshv:SCR_Protocol
  meshv:Term
  meshv:TopicalDescriptor
  meshv:TreeNumber

MeSH Predicates

  nquire -get -url "http://id.nlm.nih.gov/mesh/sparql" \\
    -query "SELECT DISTINCT ?p FROM <http://id.nlm.nih.gov/mesh> WHERE { ?s ?p ?o } ORDER BY ?p" |
  xtract -pattern result -pfx "meshv:" -first "uri[http://id.nlm.nih.gov/mesh/vocab#|]"

  meshv:abbreviation
  meshv:active
  meshv:allowableQualifier
  meshv:altLabel
  meshv:annotation
  meshv:broaderConcept
  meshv:broaderDescriptor
  meshv:broaderQualifier
  meshv:casn1_label
  meshv:concept
  meshv:considerAlso
  meshv:dateCreated
  meshv:dateEstablished
  meshv:dateRevised
  meshv:entryVersion
  meshv:frequency
  meshv:hasDescriptor
  meshv:hasQualifier
  meshv:historyNote
  meshv:identifier
  meshv:indexerConsiderAlso
  meshv:lastActiveYear
  meshv:lexicalTag
  meshv:mappedTo
  meshv:narrowerConcept
  meshv:nlmClassificationNumber
  meshv:note
  meshv:onlineNote
  meshv:parentTreeNumber
  meshv:pharmacologicalAction
  meshv:prefLabel
  meshv:preferredConcept
  meshv:preferredMappedTo
  meshv:preferredTerm
  meshv:previousIndexing
  meshv:publicMeSHNote
  meshv:registryNumber
  meshv:relatedConcept
  meshv:relatedRegistryNumber
  meshv:scopeNote
  meshv:seeAlso
  meshv:sortVersion
  meshv:source
  meshv:term
  meshv:thesaurusID
  meshv:treeNumber
  meshv:useInstead

WikiData Predicate List

  nquire -url "https://query.wikidata.org/sparql" \\
    -query "SELECT ?property ?propertyType ?propertyLabel \\
      ?propertyDescription ?propertyAltLabel WHERE { \\
      ?property wikibase:propertyType ?propertyType . SERVICE wikibase:label \\
      { bd:serviceParam wikibase:language '[AUTO_LANGUAGE],en'. } } \\
      ORDER BY ASC(xsd:integer(STRAFTER(STR(?property), 'P')))" |
  xtract -pattern result -first "uri[http://www.wikidata.org/entity/|]" -first literal

Selected WikiData Predicates

  P6       head of government
  P16      highway system
  P17      country
  P19      place of birth
  P21      sex or gender
  P22      father
  P25      mother
  P26      spouse
  P30      continent
  P31      instance of
  P35      head of state
  P36      capital
  P40      child
  P105     taxon rank
  P660     EC enzyme classification
  P672     MeSH Code
  P680     molecular function
  P681     cell component
  P682     biological process
  P685     NCBI Taxonomy ID
  P698     PubMed ID
  P699     Disease Ontology ID
  P932     PMCID
  P1340    eye color
  P2067    mass
  P2410    WikiPathways ID
  P2888    exact match

Vitamin Binding Site

  nquire -get -url "http://www.wikidata.org/entity/Q22679758" |
  transmute -j2x |
  xtract -pattern entities -group claims -block P527 -element "value\@id"

Children of JS Bach

  nquire -url "https://query.wikidata.org/sparql" \\
    -query "SELECT ?child ?childLabel WHERE \\
      { ?child wdt:P22 wd:Q1339. SERVICE wikibase:label \\
        { bd:serviceParam wikibase:language '[AUTO_LANGUAGE],en'. } }" |
  xtract -pattern result -block binding -if "\@name" -equals childLabel -element literal

Eye Color Frequency

  nquire -url "https://query.wikidata.org/sparql" \\
    -query "SELECT ?eyeColorLabel WHERE \\
      { ?human wdt:P31 wd:Q5. ?human wdt:P1340 ?eyeColor. SERVICE wikibase:label \\
        { bd:serviceParam wikibase:language '[AUTO_LANGUAGE],en'. } }" |
  xtract -pattern result -element literal |
  sort-uniq-count-rank

Federated Query

  nquire -url "https://query.wikidata.org/sparql" \\
    -query " \\
      PREFIX wp:      <http://vocabularies.wikipathways.org/wp#> \\
      PREFIX dcterms:  <http://purl.org/dc/terms/> \\
      PREFIX dc:      <http://purl.org/dc/elements/1.1/> \\
      SELECT DISTINCT ?metabolite1Label ?metabolite2Label ?mass1 ?mass2 WITH { \\
        SELECT ?metabolite1 ?metabolite2 WHERE { \\
          ?pathwayItem wdt:P2410 'WP706'; \\
                       wdt:P2888 ?pwIri. \\
          SERVICE <http://sparql.wikipathways.org/> { \\
            ?pathway dc:identifier ?pwIri. \\
            ?interaction rdf:type wp:Interaction; \\
                         wp:participants ?wpmb1, ?wpmb2; \\
                         dcterms:isPartOf ?pathway. \\
            FILTER (?wpmb1 != ?wpmb2) \\
            ?wpmb1 wp:bdbWikidata ?metabolite1. \\
            ?wpmb2 wp:bdbWikidata ?metabolite2. \\
          } \\
        } \\
      } AS %metabolites WHERE { \\
        INCLUDE %metabolites. \\
        ?metabolite1 wdt:P2067 ?mass1. \\
        ?metabolite2 wdt:P2067 ?mass2. \\
        SERVICE wikibase:label { bd:serviceParam wikibase:language '[AUTO_LANGUAGE],en'. } \\
      }" |
  xtract -pattern result -block binding -element "binding\@name" literal

};

sub nquire {

  # nquire -url http://... -tag value -tag value | ...

  $url = "";
  $arg = "";
  $pfx = "";
  $amp = "";
  $pat = "";

  @args = @ARGV;
  $max = scalar @args;

  if ( $max < 1 ) {
    return;
  }

  if ( $ARGV[0] eq "-version" ) {
    print "nquire $version\n";
    return;
  }

  if ( $ARGV[0] eq "-help" ) {
    print "nquire $version\n";
    print $nquire_help;
    return;
  }

  # -examples prints advanced sparql queries (undocumented)

  if ( $ARGV[0] eq "-examples" or $ARGV[0] eq "-example" or $ARGV[0] eq "-extras" or $ARGV[0] eq "-extra" ) {
    print "nquire $version\n";
    print $nquire_examples;
    return;
  }

  if ( $max < 2 ) {
    return;
  }

  $i = 0;

  # if present, -debug must be first argument, only prints generated URL (undocumented)

  if ( $i < $max ) {
    $pat = $args[$i];
    if ( $pat eq "-debug" ) {
      $i++;
      $debug = true;
    }
  }

  # if present, -ftp must be next

  if ( $i < $max ) {
    $pat = $args[$i];
    if ( $pat eq "-ftp" ) {
      $i++;
      if ( $i < $max ) {
        my $server = $args[$i];
        $i++;
        if ( $i < $max ) {
          my $dir = $args[$i];
          $i++;
          if ( $i < $max ) {
            my $fl = $args[$i];

            my $ftp = new Net::FTP($server, Passive => 1)
              or die "Unable to connect to FTP server: $!";

            $ftp->login or die "Unable to log in to FTP server: ", $ftp->message;
            $ftp->cwd($dir) or die "Unable to change to $dir: ", $ftp->message;
            $ftp->binary or warn "Unable to set binary mode: ", $ftp->message;

            if (! -e $fl) {
              if (! $ftp->get($fl, "/dev/stdout") ) {
                my $msg = $ftp->message;
                chomp $msg;
                print STDERR "\nFAILED TO DOWNLOAD:\n\n$fl ($msg\n";
              }
            }
          }
        }
      }
      return;
    }
  }

  # if present, -http get or -get must be next

  # nquire -get -url "http://collections.mnh.si.edu/services/resolver/resolver.php" -voucher "Birds:625456"

  if ( $i < $max ) {
    $pat = $args[$i];
    if ( $pat eq "-http" ) {
      $i++;
      if ( $i < $max ) {
        $http = $args[$i];
        $i++;
      }
    } elsif ( $pat eq "-get" ) {
      $i++;
      $http = "get";
    }
  }

  # if present, -agent must be next argument (undocumented)

  if ( $i < $max ) {
    $pat = $args[$i];
    if ( $pat eq "-agent" ) {
      $i++;
      if ( $i < $max ) {
        $agent = $args[$i];
        $i++;
      }
    }
  }

  # read file of keyword shortcuts for URL expansion

  if ( $i < $max ) {
    $pat = $args[$i];
    if ( $pat eq "-alias" ) {
      $i++;
      if ( $i < $max ) {
        $alias = $args[$i];
        if ( $alias ne "" ) {
          read_aliases ();
        }
        $i++;
      }
    }
  }

  # read URL

  if ( $i < $max ) {
    $pat = $args[$i];
    if ( $pat eq "-url" ) {
      $i++;
      if ( $i < $max ) {
        $url = $args[$i];
        $url = map_macros ($url);
        $i++;
      }
    } elsif ( $pat eq "-ncbi" ) {
      # shortcut for ncbi base (undocumented)
      $i++;
      if ( $i < $max ) {
        $url = "https://www.ncbi.nlm.nih.gov";
      }
    } elsif ( $pat eq "-eutils" ) {
      # shortcut for eutils base (undocumented)
      $i++;
      if ( $i < $max ) {
        $url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils";
      }
    } elsif ( $pat eq "-test" ) {
      # shortcut for eutilstest base (undocumented)
      $i++;
      if ( $i < $max ) {
        $url = "https://eutilstest.ncbi.nlm.nih.gov/entrez/eutils";
      }
    } elsif ( $pat eq "-hydra" ) {
      # internal citation match request (undocumented)
      $i++;
      if ( $i < $max ) {
        $url = "https://www.ncbi.nlm.nih.gov/projects/hydra/hydra_search.cgi";
        $pat = $args[$i];
        $pat = map_macros ($pat);
        $enc = do_uri_escape ($pat);
        $arg="search=pubmed_search_citation_top_20.1&query=$enc";
        $amp = "&";
        $i++;
      }
    } elsif ( $pat eq "-revhist" ) {
      # internal sequence revision history request (undocumented)
      $i++;
      if ( $i < $max ) {
        $url = "https://www.ncbi.nlm.nih.gov/sviewer/girevhist.cgi";
        $pat = $args[$i];
        $arg="cmd=seqid&txt=on&seqid=asntext&os=PUBSEQ_OS&val=$pat";
        $amp = "&";
        $i++;
      }
    } elsif ( $pat eq "-biosample" ) {
      # internal biosample_chk request on live database (undocumented)
      $i++;
      if ( $i < $max ) {
        $http = "get";
        $url = "https://api-int.ncbi.nlm.nih.gov/biosample/fetch";
        $bid = $args[$i];
        $arg="format=source&id=$bid";
        $amp = "&";
        $i++;
      }
    } elsif ( $pat eq "-biosample-dev" ) {
      # internal biosample_chk request on development database (undocumented)
      $i++;
      if ( $i < $max ) {
        $http = "get";
        $url = "https://dev-api-int.ncbi.nlm.nih.gov/biosample/fetch";
        $bid = $args[$i];
        $arg="format=source&id=$bid";
        $amp = "&";
        $i++;
      }
    }
  }

  if ( $url eq "" ) {
    return;
  }

  # hard-coded URL aliases for common NCBI web sites

  if ( $url =~ /\(#/ ) {

    $ky = "ncbi_url";
    if ( $url =~ /\(#$ky\)/ ) {
      $vl = "https://www.ncbi.nlm.nih.gov";
      $url =~ s/\((\#$ky)\)/$vl/g;
    }

    $ky = "eutils_url";
    if ( $url =~ /\(#$ky\)/ ) {
      $vl = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils";
      $url =~ s/\((\#$ky)\)/$vl/g;
    }
  }

  # arguments before next minus are added to base URL as /value

  $go_on = true;
  while ( $i < $max and $go_on ) {
    $pat = $args[$i];
    if ( $pat =~ /^-(.+)/ ) {
      $go_on = false;
    } else {
      $pat = map_macros ($pat);
      $url .= "/" . $pat;
      $i++;
    }
  }

  # now expect tag with minus and value[s] without, add as &tag=value[,value]

  while ( $i < $max ) {
    $pat = $args[$i];
    if ( $pat =~ /^-(.+)/ ) {
      $pat = $1;
      $pfx = $amp . "$pat=";
      $amp = "";
    } else {
      $pat =~ s/^\\-/-/g;
      $pat = map_macros ($pat);
      $enc = do_uri_escape ($pat);
      $arg .= $pfx . $enc;
      $pfx = ",";
      $amp = "&";
    }
    $i++;
  }

  # perform query
  $output = do_post ($url, $arg);

  print "$output";
}

# initialize

clearflags ();

# execute URL request

nquire ();

# close input and output files

close (STDIN);
close (STDOUT);
close (STDERR);
