/*
* ===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*               National Center for Biotechnology Information
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government have not placed any restriction on its use or reproduction.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
*  Please cite the author in any work or product based on this material.
*
* ===========================================================================
*
* Author:  Jonathan Kans
*
*/

#include <ncbi_pch.hpp>

#include <util/unicode.hpp>
#include <util/static_set.hpp>
#include <util/static_map.hpp>

#include <objects/misc/sequence_macros.hpp>

#include <objmgr/feat_ci.hpp>
#include <objmgr/seqdesc_ci.hpp>
#include <objmgr/seq_map_ci.hpp>

#include <objmgr/util/indexer.hpp>
#include <objmgr/util/sequence.hpp>

BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)


// CSeqEntryIndex

// Constructors take top-level sequence object, create a CRef<CSeqMasterIndex>, and call its initializer
// Build the index from an existing Seq-entry handle: allocate the shared master
// index and hand every argument on to its handle-based initializer.
CSeqEntryIndex::CSeqEntryIndex (CSeq_entry_Handle& topseh, EPolicy policy, TFlags flags, int depth)
    : m_Idx(new CSeqMasterIndex)
{
    m_Idx->x_Initialize(topseh, policy, flags, depth);
}

// Build the index from a top-level Seq-entry: allocate the shared master index
// and hand every argument on to its Seq-entry initializer.
CSeqEntryIndex::CSeqEntryIndex (CSeq_entry& topsep, EPolicy policy, TFlags flags, int depth)
    : m_Idx(new CSeqMasterIndex)
{
    m_Idx->x_Initialize(topsep, policy, flags, depth);
}

// Build the index from a Bioseq-set: allocate the shared master index and hand
// every argument on to its Bioseq-set initializer.
CSeqEntryIndex::CSeqEntryIndex (CBioseq_set& seqset, EPolicy policy, TFlags flags, int depth)
    : m_Idx(new CSeqMasterIndex)
{
    m_Idx->x_Initialize(seqset, policy, flags, depth);
}

// Build the index from a single Bioseq: allocate the shared master index and
// hand every argument on to its Bioseq initializer.
CSeqEntryIndex::CSeqEntryIndex (CBioseq& bioseq, EPolicy policy, TFlags flags, int depth)
    : m_Idx(new CSeqMasterIndex)
{
    m_Idx->x_Initialize(bioseq, policy, flags, depth);
}

// Build the index from a Seq-submit: allocate the shared master index and hand
// every argument on to its Seq-submit initializer.
CSeqEntryIndex::CSeqEntryIndex (CSeq_submit& submit, EPolicy policy, TFlags flags, int depth)
    : m_Idx(new CSeqMasterIndex)
{
    m_Idx->x_Initialize(submit, policy, flags, depth);
}

// Build the index from a Seq-entry plus a separately supplied Submit-block:
// allocate the shared master index and forward all arguments to its initializer.
CSeqEntryIndex::CSeqEntryIndex (CSeq_entry& topsep, CSubmit_block &sblock, EPolicy policy, TFlags flags, int depth)
    : m_Idx(new CSeqMasterIndex)
{
    m_Idx->x_Initialize(topsep, sblock, policy, flags, depth);
}

// Build the index from a Seq-entry plus a separately supplied Seq-descr:
// allocate the shared master index and forward all arguments to its initializer.
CSeqEntryIndex::CSeqEntryIndex (CSeq_entry& topsep, CSeq_descr &descr, EPolicy policy, TFlags flags, int depth)
    : m_Idx(new CSeqMasterIndex)
{
    m_Idx->x_Initialize(topsep, descr, policy, flags, depth);
}

// Return the index of the first Bioseq, as reported by the master index
CRef<CBioseqIndex> CSeqEntryIndex::GetBioseqIndex (void)
{
    CSeqMasterIndex& master = *m_Idx;
    return master.GetBioseqIndex();
}

// Return the index of the Nth Bioseq; position handling is done by the master index
CRef<CBioseqIndex> CSeqEntryIndex::GetBioseqIndex (int n)
{
    CSeqMasterIndex& master = *m_Idx;
    return master.GetBioseqIndex(n);
}

// Return the Bioseq index registered under the given accession string
CRef<CBioseqIndex> CSeqEntryIndex::GetBioseqIndex (const string& accn)
{
    CSeqMasterIndex& master = *m_Idx;
    return master.GetBioseqIndex(accn);
}

// Return the Bioseq index for a Bioseq handle (the master index looks it up
// through the handle's best Seq-id string)
CRef<CBioseqIndex> CSeqEntryIndex::GetBioseqIndex (CBioseq_Handle bsh)
{
    CSeqMasterIndex& master = *m_Idx;
    return master.GetBioseqIndex(bsh);
}

// Return the Bioseq index for the sequence a mapped feature is located on
CRef<CBioseqIndex> CSeqEntryIndex::GetBioseqIndex (const CMappedFeat& mf)
{
    CSeqMasterIndex& master = *m_Idx;
    return master.GetBioseqIndex(mf);
}

// Return a Bioseq index covering the given sublocation
CRef<CBioseqIndex> CSeqEntryIndex::GetBioseqIndex (const CSeq_loc& loc)
{
    CSeqMasterIndex& master = *m_Idx;
    return master.GetBioseqIndex(loc);
}

// Return a Bioseq index covering the from..to subrange of the named accession,
// optionally on the reverse-complement strand
CRef<CBioseqIndex> CSeqEntryIndex::GetBioseqIndex (const string& accn, int from, int to, bool rev_comp)
{
    CSeqMasterIndex& master = *m_Idx;
    return master.GetBioseqIndex(accn, from, to, rev_comp);
}

// Subrange lookup without an accession: an empty accession string is passed to
// the master index's accession-based subrange overload
CRef<CBioseqIndex> CSeqEntryIndex::GetBioseqIndex (int from, int to, bool rev_comp)
{
    const string no_accession;
    return m_Idx->GetBioseqIndex(no_accession, from, to, rev_comp);
}

// Expose the master index's vector of Bioseq indices by const reference
const vector<CRef<CBioseqIndex>>& CSeqEntryIndex::GetBioseqIndices(void)
{
    CSeqMasterIndex& master = *m_Idx;
    return master.GetBioseqIndices();
}

// Expose the master index's vector of Bioseq-set indices by const reference
const vector<CRef<CSeqsetIndex>>& CSeqEntryIndex::GetSeqsetIndices(void)
{
    CSeqMasterIndex& master = *m_Idx;
    return master.GetSeqsetIndices();
}

// Report whether the master index has seen a fetch failure on any Bioseq
bool CSeqEntryIndex::IsFetchFailure(void)
{
    const bool failed = m_Idx->IsFetchFailure();
    return failed;
}


// CSeqMasterIndex

// Initializers take top-level sequence object, create Seq-entry wrapper if necessary
// Initialize from a Seq-entry handle.
// Records the policy/flags/depth settings, resolves the handle to its top-level
// entry, sets parent links on the complete Seq-entry, and then indexes the
// Bioseqs using the scope the handle already belongs to (no new scope is made).
// Any CException raised while indexing is logged rather than propagated.
void CSeqMasterIndex::x_Initialize (CSeq_entry_Handle& topseh, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags, int depth)
{
    m_Policy = policy;
    m_Flags = flags;
    m_Depth = depth;

    m_Tseh = topseh.GetTopLevelEntry();
    CConstRef<CSeq_entry> tcsep = m_Tseh.GetCompleteSeq_entry();
    // Parentize() is non-const, so constness of the complete entry is cast away
    CSeq_entry& topsep = const_cast<CSeq_entry&>(*tcsep);
    topsep.Parentize();
    m_Tsep.Reset(&topsep);

    try {
        // Same steps as x_Init, except the scope comes from the handle.
        // GetScope() yields a reference, so no null check is needed here.
        m_Scope.Reset( &m_Tseh.GetScope() );

        m_Counter.Set(0);

        // Populate vector of CBioseqIndex objects representing local Bioseqs in blob
        CRef<CSeqsetIndex> noparent;
        x_InitSeqs( *m_Tsep, noparent );
    }
    catch (const CException& e) {
        LOG_POST(Error << "Error in CSeqMasterIndex::x_Initialize: " << e.what());
    }
}

// Initialize from a caller-supplied top-level Seq-entry: store the settings,
// set parent links on the entry, keep a reference to it, then run x_Init.
void CSeqMasterIndex::x_Initialize (CSeq_entry& topsep, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags, int depth)
{
    // collection settings
    m_Policy = policy;
    m_Flags  = flags;
    m_Depth  = depth;

    // the supplied entry is used directly as the top-level record
    CSeq_entry* top = &topsep;
    top->Parentize();
    m_Tsep.Reset(top);

    x_Init();
}

// Initialize from a Bioseq-set: use its parent Seq-entry when it has one,
// otherwise wrap the set in a new Seq-entry; then run x_Init.
void CSeqMasterIndex::x_Initialize (CBioseq_set& seqset, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags, int depth)
{
    m_Policy = policy;
    m_Flags  = flags;
    m_Depth  = depth;

    CRef<CSeq_entry> top(seqset.GetParentEntry());
    if (! top) {
        // no enclosing entry yet, so create a wrapper around the set
        top.Reset(new CSeq_entry);
        top->SetSet(seqset);
    }
    top->Parentize();
    m_Tsep.Reset(top);

    x_Init();
}

// Initialize from a single Bioseq: use its parent Seq-entry when it has one,
// otherwise wrap the Bioseq in a new Seq-entry; then run x_Init.
void CSeqMasterIndex::x_Initialize (CBioseq& bioseq, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags, int depth)
{
    m_Policy = policy;
    m_Flags  = flags;
    m_Depth  = depth;

    CRef<CSeq_entry> top(bioseq.GetParentEntry());
    if (! top) {
        // no enclosing entry yet, so create a wrapper around the Bioseq
        top.Reset(new CSeq_entry);
        top->SetSeq(bioseq);
    }
    top->Parentize();
    m_Tsep.Reset(top);

    x_Init();
}

// Initialize from a Seq-submit: the first Seq-entry in the submission becomes
// the top-level record and the submission's Submit-block is remembered.
// The preconditions below are checked with _ASSERT only.
void CSeqMasterIndex::x_Initialize (CSeq_submit& submit, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags, int depth)
{
    m_Policy = policy;
    m_Flags  = flags;
    m_Depth  = depth;

    // expected shape: data and sub present, data holds a non-empty entry list
    _ASSERT(submit.CanGetData());
    _ASSERT(submit.CanGetSub());
    _ASSERT(submit.GetData().IsEntrys());
    _ASSERT(!submit.GetData().GetEntrys().empty());

    const auto& entries = submit.GetData().GetEntrys();
    CRef<CSeq_entry> first_entry = entries.front();
    first_entry->Parentize();
    m_Tsep.Reset(first_entry);

    m_SbtBlk.Reset(&submit.GetSub());

    x_Init();
}

// Initialize from a Seq-entry with a separately supplied Submit-block: both
// are retained by reference-counted members before x_Init runs.
void CSeqMasterIndex::x_Initialize (CSeq_entry& topsep, CSubmit_block &sblock, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags, int depth)
{
    m_Policy = policy;
    m_Flags  = flags;
    m_Depth  = depth;

    CSeq_entry* top = &topsep;
    top->Parentize();
    m_Tsep.Reset(top);

    // keep the submit block alongside the entry
    m_SbtBlk.Reset(&sblock);

    x_Init();
}

// Initialize from a Seq-entry with a separately supplied Seq-descr: both are
// retained by reference-counted members before x_Init runs.
void CSeqMasterIndex::x_Initialize (CSeq_entry& topsep, CSeq_descr &descr, CSeqEntryIndex::EPolicy policy, CSeqEntryIndex::TFlags flags, int depth)
{
    m_Policy = policy;
    m_Flags  = flags;
    m_Depth  = depth;

    CSeq_entry* top = &topsep;
    top->Parentize();
    m_Tsep.Reset(top);

    // keep the extra top-level descriptors alongside the entry
    m_TopDescr.Reset(&descr);

    x_Init();
}

// Poll every indexed Bioseq for its fetch-failure flag (intended to be called
// at the end of the program); true as soon as one Bioseq reports a failure
bool CSeqMasterIndex::IsFetchFailure (void)
{
    bool any_failed = false;
    for (auto it = m_BsxList.begin(); it != m_BsxList.end() && ! any_failed; ++it) {
        any_failed = (*it)->IsFetchFailure();
    }
    return any_failed;
}

// FindBestIdChoice modified from feature_item.cpp
static int s_IdxSeqIdHandle(const CSeq_id_Handle& idh)
{
    CConstRef<CSeq_id> id = idh.GetSeqId();
    CRef<CSeq_id> id_non_const
        (const_cast<CSeq_id*>(id.GetPointer()));
    return CSeq_id::Score(id_non_const);
}

// Pick the best Seq-id handle from a list, considering only the Seq-id types
// enumerated below; all other types are ignored. Ranking is delegated to a
// CBestChoiceTracker that scores candidates with s_IdxSeqIdHandle.
static CSeq_id_Handle s_IdxFindBestIdChoice(const CBioseq_Handle::TId& ids)
{
    CBestChoiceTracker< CSeq_id_Handle, int (*)(const CSeq_id_Handle&) >
        best_so_far(s_IdxSeqIdHandle);

    for (const auto& candidate : ids) {
        switch (candidate.Which()) {
            case CSeq_id::e_Genbank:
            case CSeq_id::e_Embl:
            case CSeq_id::e_Ddbj:
            case CSeq_id::e_Gi:
            case CSeq_id::e_Other:
            case CSeq_id::e_General:
            case CSeq_id::e_Tpg:
            case CSeq_id::e_Tpe:
            case CSeq_id::e_Tpd:
            case CSeq_id::e_Gpipe:
                // eligible type: offer it to the tracker
                best_so_far(candidate);
                break;
            default:
                break;
        }
    }

    return best_so_far.GetBestChoice();
}

// Return the string form of the best Seq-id on a Bioseq handle, or an empty
// string when the handle is invalid, has no ids, or none of its ids qualifies.
static string s_IdxGetBestIdString(CBioseq_Handle bsh)
{
    if (! bsh) {
        return "";
    }

    const CBioseq_Handle::TId& ids = bsh.GetId();
    if (ids.empty()) {
        return "";
    }

    CSeq_id_Handle best = s_IdxFindBestIdChoice(ids);
    if (! best) {
        return "";
    }
    return best.AsString();
}

// Walk the Seq-entry tree depth-first, appending a CBioseqIndex for every
// Bioseq and a CSeqsetIndex for every Bioseq-set found in the current scope.
// prnt is the index of the enclosing Bioseq-set (empty at the top level).
void CSeqMasterIndex::x_InitSeqs (const CSeq_entry& sep, CRef<CSeqsetIndex> prnt)
{
    if (sep.IsSeq()) {
        // leaf: a single Bioseq
        const CBioseq& bsp = sep.GetSeq();
        CBioseq_Handle bsh = m_Scope->GetBioseqHandle(bsp);
        if (! bsh) {
            return;
        }

        // index object for this Bioseq (not a surrogate, so original handle == handle)
        CRef<CBioseqIndex> bsx(new CBioseqIndex(bsh, bsp, bsh, prnt, m_Tseh, m_Scope, *this, m_Policy, m_Flags, m_Depth, false));

        // flattened list used by iteration and positional lookup
        m_BsxList.push_back(bsx);

        // lookup by accession string
        // NOTE(review): an empty accession is stored under the "" key, so a later
        // Bioseq without an accession replaces an earlier one - confirm intended
        const string& accn = bsx->GetAccession();
        m_AccnIndexMap[accn] = bsx;

        // lookup by best Seq-id string (same caveat applies to an empty best id)
        string bestid = s_IdxGetBestIdString(bsh);
        m_BestIdIndexMap[bestid] = bsx;
        return;
    }

    if (! sep.IsSet()) {
        return;
    }

    // interior node: a Bioseq-set
    const CBioseq_set& bssp = sep.GetSet();
    CBioseq_set_Handle ssh = m_Scope->GetBioseq_setHandle(bssp);
    if (! ssh) {
        return;
    }

    CRef<CSeqsetIndex> ssx(new CSeqsetIndex(ssh, bssp, prnt));
    m_SsxList.push_back(ssx);

    if (! bssp.CanGetSeq_set()) {
        return;
    }

    // descend into each member, with this set as its parent
    for (const CRef<CSeq_entry>& child : bssp.GetSeq_set()) {
        x_InitSeqs(*child, ssx);
    }
}

// Common initialization function creates local default CScope
void CSeqMasterIndex::x_Init (void)

{
    try {
        m_Objmgr = CObjectManager::GetInstance();
        if ( !m_Objmgr ) {
            /* raise hell */;
        }

        m_Scope.Reset( new CScope( *m_Objmgr ) );
        if ( !m_Scope ) {
            /* raise hell */;
        }

        m_Counter.Set(0);

        m_Scope->AddDefaults();

        m_Tseh = m_Scope->AddTopLevelSeqEntry( *m_Tsep );

        // Populate vector of CBioseqIndex objects representing local Bioseqs in blob
        CRef<CSeqsetIndex> noparent;
        x_InitSeqs( *m_Tsep, noparent );
    }
    catch (CException& e) {
        LOG_POST(Error << "Error in CSeqMasterIndex::x_Init: " << e.what());
    }
}

// Support for temporary delta sequence referring to subrange of original sequence
// Produce a local Seq-id of the form "tmp_delta_subset_<N>" that does not
// resolve to any Bioseq in the current scope; N comes from m_Counter and is
// advanced until an unused id is found.
CRef<CSeq_id> CSeqMasterIndex::x_MakeUniqueId(void)
{
    CRef<CSeq_id> candidate(new CSeq_id());
    do {
        candidate->SetLocal().SetStr("tmp_delta_subset_" + NStr::NumericToString(m_Counter.Add(1)));
        // loop again while the scope already knows a Bioseq by this id
    } while (m_Scope->GetBioseqHandle(*candidate));
    return candidate;
}

// Create a temporary delta Bioseq whose single element is the given location,
// add it to the scope under a unique local id, and return a surrogate
// CBioseqIndex for it. Returns an empty CRef if the Bioseq could not be added
// or a CException was raised (the exception is logged).
CRef<CBioseqIndex> CSeqMasterIndex::x_DeltaIndex(const CSeq_loc& loc)
{
    try {
        // Bioseq the location refers to; its Seq-inst is the template for the delta
        CBioseq_Handle source_bsh = m_Scope->GetBioseqHandle(loc);

        CRef<CBioseq> delta(new CBioseq());
        delta->SetId().push_back(x_MakeUniqueId());

        CSeq_inst& inst = delta->SetInst();
        inst.Assign(source_bsh.GetInst());
        // drop copied sequence data / extension, then rebuild as a delta
        inst.ResetSeq_data();
        inst.ResetExt();
        inst.SetRepr(CSeq_inst::eRepr_delta);

        CRef<CDelta_seq> segment(new CDelta_seq());
        segment->SetLoc().Assign(loc);
        inst.SetExt().SetDelta().Set().push_back(segment);
        inst.SetLength(sequence::GetLength(loc, m_Scope));

        // register the temporary Bioseq in the scope
        CBioseq_Handle delta_bsh = m_Scope->AddBioseq(*delta);
        if (delta_bsh) {
            // surrogate index: no parent set, original handle kept for descriptors/gaps
            CRef<CSeqsetIndex> no_parent_set;
            return CRef<CBioseqIndex>(new CBioseqIndex(delta_bsh, *delta, source_bsh, no_parent_set, m_Tseh, m_Scope, *this, m_Policy, m_Flags, m_Depth, true));
        }
    }
    catch (const CException& e) {
        LOG_POST(Error << "Error in CSeqMasterIndex::x_DeltaIndex: " << e.what());
    }
    return CRef<CBioseqIndex> ();
}

// Build an interval Seq-loc from..to on the Bioseq registered under accn,
// using the first of its Seq-ids that is one of the accession-bearing types
// listed below. The strand is minus when rev_comp is set, unknown otherwise.
// Returns an empty CConstRef if the accession is not indexed or no id qualifies.
CConstRef<CSeq_loc> CSeqMasterIndex::x_SubRangeLoc(const string& accn, int from, int to, bool rev_comp)
{
    TAccnIndexMap::iterator found = m_AccnIndexMap.find(accn);
    if (found == m_AccnIndexMap.end()) {
        return CConstRef<CSeq_loc> ();
    }

    const CSeq_loc::TStrand strand = rev_comp ? eNa_strand_minus : eNa_strand_unknown;

    CRef<CBioseqIndex> bsx = found->second;
    for (const CRef<CSeq_id>& id : bsx->GetBioseq().GetId()) {
        switch (id->Which()) {
            case CSeq_id::e_Other:
            case CSeq_id::e_Genbank:
            case CSeq_id::e_Embl:
            case CSeq_id::e_Ddbj:
            case CSeq_id::e_Tpg:
            case CSeq_id::e_Tpe:
            case CSeq_id::e_Tpd:
            {
                // the CSeq_loc constructor wants a non-const Seq-id
                CSeq_id& mutable_id = const_cast<CSeq_id&>(*id);
                return CConstRef<CSeq_loc>(new CSeq_loc(mutable_id, from, to, strand));
            }
            default:
                break;
        }
    }

    return CConstRef<CSeq_loc> ();
}

// Return the first indexed Bioseq, or an empty CRef when nothing is indexed
CRef<CBioseqIndex> CSeqMasterIndex::GetBioseqIndex (void)
{
    if (m_BsxList.empty()) {
        return CRef<CBioseqIndex> ();
    }
    return m_BsxList.front();
}

// Return the Nth indexed Bioseq, counting from 1. Any n of 1 or less yields the
// first Bioseq; an n beyond the end of the list yields an empty CRef.
CRef<CBioseqIndex> CSeqMasterIndex::GetBioseqIndex (int n)
{
    // translate the 1-based position into a vector offset, clamping low values to 0
    const size_t offset = (n > 1) ? static_cast<size_t>(n) - 1 : 0;
    if (offset >= m_BsxList.size()) {
        return CRef<CBioseqIndex> ();
    }
    return m_BsxList[offset];
}

// Look up a Bioseq index by accession string; empty CRef when not present
CRef<CBioseqIndex> CSeqMasterIndex::GetBioseqIndex (const string& accn)
{
    TAccnIndexMap::iterator found = m_AccnIndexMap.find(accn);
    if (found == m_AccnIndexMap.end()) {
        return CRef<CBioseqIndex> ();
    }
    return found->second;
}

// Look up a Bioseq index by handle, keyed on the handle's best Seq-id string;
// empty CRef when that string is not in the map
CRef<CBioseqIndex> CSeqMasterIndex::GetBioseqIndex (CBioseq_Handle bsh)
{
    const string key = s_IdxGetBestIdString(bsh);
    TBestIdIndexMap::iterator found = m_BestIdIndexMap.find(key);
    if (found == m_BestIdIndexMap.end()) {
        return CRef<CBioseqIndex> ();
    }
    return found->second;
}

// Look up the Bioseq index for the sequence a mapped feature is located on:
// the feature's location id is resolved to a handle, then to an index
CRef<CBioseqIndex> CSeqMasterIndex::GetBioseqIndex (const CMappedFeat& mf)
{
    CBioseq_Handle located_on = m_Scope->GetBioseqHandle(mf.GetLocationId());
    return GetBioseqIndex(located_on);
}

// Return a surrogate Bioseq index for a sublocation (built by x_DeltaIndex);
// the result is an empty CRef when the delta index could not be created
CRef<CBioseqIndex> CSeqMasterIndex::GetBioseqIndex (const CSeq_loc& loc)
{
    return x_DeltaIndex(loc);
}

// Return a surrogate Bioseq index for the from..to subrange of an accession.
// When accn is empty the accession of the first indexed Bioseq is used.
// Yields an empty CRef if no accession is available or no location can be built.
CRef<CBioseqIndex> CSeqMasterIndex::GetBioseqIndex (const string& accn, int from, int to, bool rev_comp)
{
    string target = accn;
    if (target.empty()) {
        // fall back to the first Bioseq in the record
        CRef<CBioseqIndex> first = GetBioseqIndex();
        if (first) {
            target = first->GetAccession();
        }
    }
    if (target.empty()) {
        return CRef<CBioseqIndex> ();
    }

    CConstRef<CSeq_loc> range = x_SubRangeLoc(target, from, to, rev_comp);
    if (! range) {
        return CRef<CBioseqIndex> ();
    }
    return GetBioseqIndex(*range);
}

// Subrange lookup without an accession: forwards an empty accession string to
// the accession-based subrange overload
CRef<CBioseqIndex> CSeqMasterIndex::GetBioseqIndex (int from, int to, bool rev_comp)
{
    const string no_accession;
    return GetBioseqIndex(no_accession, from, to, rev_comp);
}

// Allow access to internal vectors for application to use in iterators
// Read-only access to the flattened vector of Bioseq indices
const vector<CRef<CBioseqIndex>>& CSeqMasterIndex::GetBioseqIndices(void)
{
    const vector<CRef<CBioseqIndex>>& bioseqs = m_BsxList;
    return bioseqs;
}

// Read-only access to the vector of Bioseq-set indices
const vector<CRef<CSeqsetIndex>>& CSeqMasterIndex::GetSeqsetIndices(void)
{
    const vector<CRef<CSeqsetIndex>>& seqsets = m_SsxList;
    return seqsets;
}


// CSeqsetIndex

// Constructor: stores the set handle, the Bioseq-set itself and the parent set
// index, and caches the set's class (eClass_not_set when the handle has none)
CSeqsetIndex::CSeqsetIndex (CBioseq_set_Handle ssh,
                            const CBioseq_set& bssp,
                            CRef<CSeqsetIndex> prnt)
    : m_Ssh(ssh),
      m_Bssp(bssp),
      m_Prnt(prnt)
{
    if (ssh.IsSetClass()) {
        m_Class = ssh.GetClass();
    } else {
        m_Class = CBioseq_set::eClass_not_set;
    }
}


// CBioseqIndex

// Constructor: stores the handles, scope, master index and collection settings
// for one Bioseq, then caches its accession and basic Seq-inst properties.
// Gaps, descriptors and features are not collected here; their "initialized"
// flags start out false. obsh is the original Bioseq handle, which is where
// the accession is read from (bsh and obsh differ for surrogate indices).
CBioseqIndex::CBioseqIndex (CBioseq_Handle bsh,
                            const CBioseq& bsp,
                            CBioseq_Handle obsh,
                            CRef<CSeqsetIndex> prnt,
                            CSeq_entry_Handle tseh,
                            CRef<CScope> scope,
                            CSeqMasterIndex& idx,
                            CSeqEntryIndex::EPolicy policy,
                            CSeqEntryIndex::TFlags flags,
                            int depth,
                            bool surrogate)
    : m_Bsh(bsh),
      m_Bsp(bsp),
      m_OrigBsh(obsh),
      m_Prnt(prnt),
      m_Tseh(tseh),
      m_Scope(scope),
      m_Idx(&idx),
      m_Policy(policy),
      m_Flags(flags),
      m_Depth(depth),
      m_Surrogate(surrogate)
{
    // state flags for delayed collection
    m_FetchFailure = false;
    m_GapsInitialized = false;
    m_DescsInitialized = false;
    m_FeatsInitialized = false;
    m_HasSourceFeats = false;

    // accession: taken from the accession-bearing Seq-id types; if several ids
    // carry an accession, the last one encountered is kept
    m_Accession.clear();
    for (const auto& sid : obsh.GetId()) {
        const auto choice = sid.Which();
        const bool has_text_id =
            choice == CSeq_id::e_Other   ||
            choice == CSeq_id::e_Genbank ||
            choice == CSeq_id::e_Embl    ||
            choice == CSeq_id::e_Ddbj    ||
            choice == CSeq_id::e_Tpg     ||
            choice == CSeq_id::e_Tpe     ||
            choice == CSeq_id::e_Tpd;
        if (! has_text_id) {
            continue;
        }
        CConstRef<CSeq_id> id = sid.GetSeqId();
        const CTextseq_id& tsid = *id->GetTextseq_Id ();
        if (tsid.IsSetAccession()) {
            m_Accession = tsid.GetAccession ();
        }
    }

    // molecule class comes straight from the handle
    m_IsNA = m_Bsh.IsNa();
    m_IsAA = m_Bsh.IsAa();

    // Seq-inst defaults, overridden below when the Bioseq has a Seq-inst
    m_Topology = CSeq_inst::eTopology_not_set;
    m_Length = 0;
    m_IsDelta = false;
    m_IsVirtual = false;
    m_IsMap = false;

    if (m_Bsh.IsSetInst()) {
        if (m_Bsh.IsSetInst_Topology()) {
            m_Topology = m_Bsh.GetInst_Topology();
        }

        // prefer the recorded length, otherwise ask the handle for the Bioseq length
        m_Length = m_Bsh.IsSetInst_Length() ? m_Bsh.GetInst_Length()
                                            : m_Bsh.GetBioseqLength();

        if (m_Bsh.IsSetInst_Repr()) {
            const CBioseq_Handle::TInst_Repr repr = m_Bsh.GetInst_Repr();
            m_IsDelta   = (repr == CSeq_inst::eRepr_delta);
            m_IsVirtual = (repr == CSeq_inst::eRepr_virtual);
            m_IsMap     = (repr == CSeq_inst::eRepr_map);
        }
    }

    // descriptor-derived values start empty / unknown until x_InitDescs runs
    m_Title.clear();
    m_MolInfo.Reset();
    m_BioSource.Reset();
    m_Taxname.clear();

    m_Biomol = CMolInfo::eBiomol_unknown;
    m_Tech = CMolInfo::eTech_unknown;
    m_Completeness = CMolInfo::eCompleteness_unknown;

    m_ForceOnlyNearFeats = false;
}

// Destructor: a surrogate index owns a temporary Bioseq in the scope, so it
// tries to remove it; a CException from the removal is deliberately ignored
CBioseqIndex::~CBioseqIndex (void)
{
    if (! m_Surrogate) {
        return;
    }

    try {
        m_Scope->RemoveBioseq(m_Bsh);
    } catch (const CException&) {
        // presumably still in use; let it be
    }
}

// Gap collection (delayed until needed)
void CBioseqIndex::x_InitGaps (void)

{
    try {
        if (m_GapsInitialized) {
            return;
        }

        m_GapsInitialized = true;

        if (! m_IsDelta) {
            return;
        }

        SSeqMapSelector sel;

        sel.SetFlags(CSeqMap::fFindGap)
           .SetResolveCount(1);

        // explore gaps, pass original target BioseqHandle if using Bioseq sublocation
        for (CSeqMap_CI gap_it(m_OrigBsh, sel); gap_it; ++gap_it) {

            TSeqPos start = gap_it.GetPosition();
            TSeqPos end = gap_it.GetEndPosition() - 1;
            TSeqPos length = gap_it.GetLength();

            // attempt to find CSeq_gap info
            const CSeq_gap * pGap = NULL;
            if( gap_it.IsSetData() && gap_it.GetData().IsGap() ) {
                pGap = &gap_it.GetData().GetGap();
            } else {
                CConstRef<CSeq_literal> pSeqLiteral = gap_it.GetRefGapLiteral();
                if( pSeqLiteral && pSeqLiteral->IsSetSeq_data() ) {
                     const CSeq_data & seq_data = pSeqLiteral->GetSeq_data();
                     if( seq_data.IsGap() ) {
                         pGap = &seq_data.GetGap();
                     }
                }
            }

            CFastaOstream::SGapModText gap_mod_text;
            if( pGap ) {
                CFastaOstream::GetGapModText(*pGap, gap_mod_text);
            }
            string type = gap_mod_text.gap_type;
            vector<string>& evidence = gap_mod_text.gap_linkage_evidences;

            bool isUnknownLength = gap_it.IsUnknownLength();

            // feature name depends on what quals we use
            bool isAssemblyGap = ( ! type.empty() || ! evidence.empty() );

            CRef<CGapIndex> sgx(new CGapIndex(start, end, length, type, evidence, isUnknownLength, isAssemblyGap, *this));
            m_GapList.push_back(sgx);
        }
    }
    catch (CException& e) {
        LOG_POST(Error << "Error in CBioseqIndex::x_InitGaps: " << e.what());
    }
}

// Descriptor collection (delayed until needed)
void CBioseqIndex::x_InitDescs (void)

{
    try {
        if (m_DescsInitialized) {
           return;
        }

        m_DescsInitialized = true;

        // explore descriptors, pass original target BioseqHandle if using Bioseq sublocation
        for (CSeqdesc_CI desc_it(m_OrigBsh); desc_it; ++desc_it) {
            const CSeqdesc& sd = *desc_it;
            CRef<CDescriptorIndex> sdx(new CDescriptorIndex(sd, *this));
            m_SdxList.push_back(sdx);

            switch (sd.Which()) {
                case CSeqdesc::e_Source:
                {
                    if (! m_BioSource) {
                        const CBioSource& biosrc = sd.GetSource();
                        m_BioSource.Reset (&biosrc);
                        if (biosrc.IsSetOrgname()) {
                            const COrg_ref& org = biosrc.GetOrg();
                            if (org.CanGetTaxname()) {
                                m_Taxname = org.GetTaxname();
                            }
                        }
                    }
                    break;
                }
                case CSeqdesc::e_Molinfo:
                {
                    if (! m_MolInfo) {
                        const CMolInfo& molinf = sd.GetMolinfo();
                        m_MolInfo.Reset (&molinf);
                        m_Biomol = molinf.GetBiomol();
                        m_Tech = molinf.GetTech();
                        m_Completeness = molinf.GetCompleteness();
                    }
                    break;
                }
                case CSeqdesc::e_Title:
                {
                    if (m_Title.empty()) {
                        m_Title = sd.GetTitle();
                    }
                    break;
                }
                case CSeqdesc::e_User:
                {
                    const CUser_object& usr = sd.GetUser();
                    if (usr.IsSetType()) {
                        const CObject_id& oi = usr.GetType();
                        if (oi.IsStr()) {
                            const string& type = oi.GetStr();
                            if (NStr::EqualNocase(type, "FeatureFetchPolicy")) {
                                FOR_EACH_USERFIELD_ON_USEROBJECT (uitr, usr) {
                                    const CUser_field& fld = **uitr;
                                    if (fld.IsSetLabel() && fld.GetLabel().IsStr()) {
                                        const string &label_str = GET_FIELD(fld.GetLabel(), Str);
                                        if (! NStr::EqualNocase(label_str, "Policy")) continue;
                                        if (fld.IsSetData() && fld.GetData().IsStr()) {
                                            const string& str = fld.GetData().GetStr();
                                            if (NStr::EqualNocase(str, "OnlyNearFeatures")) {
                                                m_ForceOnlyNearFeats = true;
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                    break;
                }
                default:
                    break;
            }
        }
    }
    catch (CException& e) {
        LOG_POST(Error << "Error in CBioseqIndex::x_InitDescs: " << e.what());
    }
}

// Feature collection (delayed until needed).
// Runs at most once. Descriptors are initialized first because they may set
// m_ForceOnlyNearFeats. An SAnnotSelector is configured from the policy, depth
// and flag settings, then every feature on the Bioseq is wrapped in a
// CFeatureIndex and recorded in m_SfxList, m_FeatTree and m_FeatIndexMap.
// While iterating, the function also
//   - notes whether any BioSource feature exists (m_HasSourceFeats),
//   - tracks the best protein feature on a protein Bioseq
//     (longest; ties broken by lower "processed" value),
//   - links CDS / RNA / Prot features to the index of their product Bioseq.
// Any CException (including unresolved far fetches, which the selector is told
// to report) sets m_FetchFailure and is logged.
void CBioseqIndex::x_InitFeats (void)
{
    try {
        if (m_FeatsInitialized) {
           return;
        }

        if (! m_DescsInitialized) {
            // initialize descriptors first to get m_ForceOnlyNearFeats flag
            x_InitDescs();
        }

        m_FeatsInitialized = true;

        SAnnotSelector sel;

        if (m_Policy != CSeqEntryIndex::fExternal) {
            // unless explicitly desired, exclude external annots
            sel.ExcludeNamedAnnots("CDD")
               .ExcludeNamedAnnots("SNP")
               .ExcludeNamedAnnots("STS");
        }

        if (m_Policy == CSeqEntryIndex::fExhaustive) {

            sel.SetResolveAll();
            // experimental flag forces collection of features from all levels
            sel.SetResolveDepth(kMax_Int);
            // also ignores RefSeq/INSD barrier, far fetch policy user object

        } else if (m_Policy == CSeqEntryIndex::fExternal) {

            // same as fAdaptive, except also allows external annots
            sel.SetResolveAll();
            sel.SetAdaptiveDepth(true);

        } else if (m_Policy == CSeqEntryIndex::fInternal || m_ForceOnlyNearFeats) {

            // do not fetch features from underlying sequence component records
            if (m_Surrogate) {
                // delta with sublocation needs to map features from original Bioseq
                sel.SetResolveAll();
                sel.SetResolveDepth(1);
                sel.SetExcludeExternal();
            } else {
                // otherwise limit collection to local records in top-level Seq-entry
                sel.SetResolveDepth(0);
                sel.SetExcludeExternal();
            }

        } else if (m_Depth > -1) {

            sel.SetResolveAll();
            // explicit depth setting overrides adaptive depth (probably only needed for debugging)
            sel.SetResolveDepth(m_Depth);

        } else if (m_Policy == CSeqEntryIndex::fAdaptive) {

            sel.SetResolveAll();
            // normal situation uses adaptive depth for feature collection,
            // includes barrier between RefSeq and INSD accession types
            sel.SetAdaptiveDepth(true);
        }

        // bit flags exclude specific features
        if ((m_Flags & CSeqEntryIndex::fHideImpFeats) != 0) {
            sel.ExcludeFeatType(CSeqFeatData::e_Imp);
        }
        if ((m_Flags & CSeqEntryIndex::fHideSNPFeats) != 0) {
            sel.ExcludeFeatType(CSeqFeatData::e_Variation);
            sel.ExcludeFeatSubtype(CSeqFeatData::eSubtype_variation);
        }
        if ((m_Flags & CSeqEntryIndex::fHideSTSFeats) != 0) {
            sel.ExcludeFeatSubtype(CSeqFeatData::eSubtype_STS);
        }
        if ((m_Flags & CSeqEntryIndex::fHideExonFeats) != 0) {
            sel.ExcludeFeatSubtype(CSeqFeatData::eSubtype_exon);
        }
        if ((m_Flags & CSeqEntryIndex::fHideIntronFeats) != 0) {
            sel.ExcludeFeatSubtype(CSeqFeatData::eSubtype_intron);
        }
        if ((m_Flags & CSeqEntryIndex::fHideMiscFeats) != 0) {
            sel.ExcludeFeatSubtype(CSeqFeatData::eSubtype_misc_feature);
        }

        // additional common settings
        sel.ExcludeFeatSubtype(CSeqFeatData::eSubtype_non_std_residue)
           .ExcludeFeatSubtype(CSeqFeatData::eSubtype_rsite)
           .ExcludeFeatSubtype(CSeqFeatData::eSubtype_seq);

        sel.SetFeatComparator(new feature::CFeatComparatorByLabel);

        // request exception to capture fetch failure
        sel.SetFailUnresolved();

        // limit feature collection to immediate Bioseq-set parent
        CRef<CSeqsetIndex> prnt = GetParent();
        if (prnt) {
            CBioseq_set_Handle bssh = prnt->GetSeqsetHandle();
            if (bssh) {
                CSeq_entry_Handle pseh = bssh.GetParentEntry();
                if (pseh) {
                    sel.SetLimitSeqEntry(pseh);
                }
            }
        }

        // running state for choosing m_BestProteinFeature
        TSeqPos longest = 0;
        CProt_ref::EProcessed bestprocessed = CProt_ref::eProcessed_not_set;

        // iterate features on Bioseq
        for (CFeat_CI feat_it(m_Bsh, sel); feat_it; ++feat_it) {
            const CMappedFeat mf = *feat_it;
            CSeq_feat_Handle hdl = mf.GetSeq_feat_Handle();

            CRef<CFeatureIndex> sfx(new CFeatureIndex(hdl, mf, *this));
            m_SfxList.push_back(sfx);

            m_FeatTree.AddFeature(mf);

            // CFeatureIndex from CMappedFeat for use with GetBestGene
            m_FeatIndexMap[mf] = sfx;

            // set specific flags for various feature types
            const CSeqFeatData::E_Choice type = sfx->GetType();

            if (type == CSeqFeatData::e_Biosrc) {
                m_HasSourceFeats = true;
                continue;
            }

            if (type == CSeqFeatData::e_Prot && IsAA()) {
                if (! mf.IsSetData ()) continue;
                const CSeqFeatData& sfdata = mf.GetData();
                const CProt_ref& prp = sfdata.GetProt();
                CProt_ref::EProcessed processed = CProt_ref::eProcessed_not_set;
                if (prp.IsSetProcessed()) {
                    processed = prp.GetProcessed();
                }
                const CSeq_loc& loc = mf.GetLocation ();
                const TSeqPos prot_length = sequence::GetLength(loc, m_Scope);
                if (prot_length > longest) {
                    m_BestProteinFeature = sfx;
                    longest = prot_length;
                    bestprocessed = processed;
                } else if (prot_length == longest) {
                    // unprocessed 0 > preprotein 1 > mat peptide 2
                    if (processed < bestprocessed) {
                        m_BestProteinFeature = sfx;
                        bestprocessed = processed;
                    }
                }
                continue;
            }

            // only these feature/molecule combinations are linked to a product Bioseq
            const bool links_to_product =
                (type == CSeqFeatData::e_Cdregion && IsNA()) ||
                (type == CSeqFeatData::e_Rna && IsAA()) ||
                (type == CSeqFeatData::e_Prot && IsNA());
            if (! links_to_product) {
                continue;
            }

            // index feature for product (CDS -> protein, mRNA -> cDNA, or Prot -> peptide)
            CSeq_id_Handle idh = mf.GetProductId();
            if (idh) {
                CBioseq_Handle pbsh = m_Scope->GetBioseqHandle(idh);
                if (pbsh) {
                    CWeakRef<CSeqMasterIndex> idx = GetSeqMasterIndex();
                    auto idxl = idx.Lock();
                    if (idxl) {
                        CRef<CBioseqIndex> bsxp = idxl->GetBioseqIndex(pbsh);
                        if (bsxp) {
                            bsxp->m_FeatureForProduct = sfx;
                        }
                    }
                }
            }
        }
    }
    catch (const CException& e) {
        m_FetchFailure = true;
        LOG_POST(Error << "Error in CBioseqIndex::x_InitFeats: " << e.what());
    }
}

// GetFeatureForProduct returns the indexed feature whose product is this Bioseq, which
// allows the hypothetical protein defline generator to obtain the gene locus tag.
// When nothing is cached yet, it looks for a CDS, then RNA, then Prot feature (by product,
// within this TSE) and indexes features on the Bioseq at that feature's location;
// x_InitFeats stores m_FeatureForProduct on each product Bioseq it encounters.
CRef<CFeatureIndex> CBioseqIndex::GetFeatureForProduct (void)
{
    // already cached
    if (m_FeatureForProduct) {
        return m_FeatureForProduct;
    }
    // no Bioseq handle to search with
    if (! m_Bsh) {
        return m_FeatureForProduct;
    }

    // try feature types in order of preference, stopping at the first hit
    CFeat_CI feat_it(m_Bsh,
                     SAnnotSelector(CSeqFeatData::e_Cdregion)
                     .SetByProduct().SetLimitTSE(m_Bsh.GetTSE_Handle()));
    if (! feat_it) {
        feat_it = CFeat_CI(m_Bsh,
                           SAnnotSelector(CSeqFeatData::e_Rna)
                           .SetByProduct().SetLimitTSE(m_Bsh.GetTSE_Handle()));
    }
    if (! feat_it) {
        feat_it = CFeat_CI(m_Bsh,
                           SAnnotSelector(CSeqFeatData::e_Prot)
                           .SetByProduct().SetLimitTSE(m_Bsh.GetTSE_Handle()));
    }
    if (! feat_it) {
        return m_FeatureForProduct;
    }

    // locate the Bioseq the feature is placed on
    CMappedFeat feat = *feat_it;
    CSeq_id_Handle loc_id = feat.GetLocationId();
    CBioseq_Handle loc_bsh = m_Scope->GetBioseqHandle(loc_id);
    if (loc_bsh) {
        CWeakRef<CSeqMasterIndex> weak_master = GetSeqMasterIndex();
        auto master = weak_master.Lock();
        if (master) {
            CRef<CBioseqIndex> loc_bsx = master->GetBioseqIndex(loc_bsh);
            // indexing that Bioseq's features fills in m_FeatureForProduct on its products
            if (loc_bsx && ! loc_bsx->m_FeatsInitialized) {
                loc_bsx->x_InitFeats();
            }
        }
    }

    return m_FeatureForProduct;
}

// Get Bioseq index containing the feature whose product points to this Bioseq.
// Returns an empty weak reference when GetFeatureForProduct finds no such feature.
CWeakRef<CBioseqIndex> CBioseqIndex::GetBioseqForProduct (void)
{
    CRef<CFeatureIndex> prod_feat = GetFeatureForProduct();
    if (! prod_feat) {
        return CWeakRef<CBioseqIndex> ();
    }

    return prod_feat->GetBioseqIndex();
}

// GetBestProteinFeature returns the protein feature selected by x_InitFeats: the longest one,
// with ties going to the lower processing state (unprocessed, then preprotein, then mat peptide).
// Features are indexed on first use.
CRef<CFeatureIndex> CBioseqIndex::GetBestProteinFeature (void)
{
    if (m_FeatsInitialized) {
        return m_BestProteinFeature;
    }

    x_InitFeats();
    return m_BestProteinFeature;
}

// HasSourceFeats reports whether Bioseq has BioSource features.
// Makes sure x_InitFeats has run before reading m_HasSourceFeats.
bool CBioseqIndex::HasSourceFeats (void)
{
    if (m_FeatsInitialized) {
        return m_HasSourceFeats;
    }

    x_InitFeats();
    return m_HasSourceFeats;
}

// Common descriptor field getters
// Returns m_Title, running x_InitDescs first if descriptors are not yet initialized
const string& CBioseqIndex::GetTitle (void)
{
    if (m_DescsInitialized) {
        return m_Title;
    }

    x_InitDescs();
    return m_Title;
}

// Returns m_MolInfo, running x_InitDescs first if descriptors are not yet initialized
CConstRef<CMolInfo> CBioseqIndex::GetMolInfo (void)
{
    if (m_DescsInitialized) {
        return m_MolInfo;
    }

    x_InitDescs();
    return m_MolInfo;
}

// Returns m_Biomol, running x_InitDescs first if descriptors are not yet initialized
CMolInfo::TBiomol CBioseqIndex::GetBiomol (void)
{
    if (m_DescsInitialized) {
        return m_Biomol;
    }

    x_InitDescs();
    return m_Biomol;
}

// Returns m_Tech, running x_InitDescs first if descriptors are not yet initialized
CMolInfo::TTech CBioseqIndex::GetTech (void)
{
    if (m_DescsInitialized) {
        return m_Tech;
    }

    x_InitDescs();
    return m_Tech;
}

// Returns m_Completeness, running x_InitDescs first if descriptors are not yet initialized
CMolInfo::TCompleteness CBioseqIndex::GetCompleteness (void)
{
    if (m_DescsInitialized) {
        return m_Completeness;
    }

    x_InitDescs();
    return m_Completeness;
}

// Returns m_BioSource, running x_InitDescs first if descriptors are not yet initialized
CConstRef<CBioSource> CBioseqIndex::GetBioSource (void)
{
    if (m_DescsInitialized) {
        return m_BioSource;
    }

    x_InitDescs();
    return m_BioSource;
}

// Returns m_Taxname, running x_InitDescs first if descriptors are not yet initialized
const string& CBioseqIndex::GetTaxname (void)
{
    if (m_DescsInitialized) {
        return m_Taxname;
    }

    x_InitDescs();
    return m_Taxname;
}

// Looks up the CFeatureIndex registered for mf in m_FeatIndexMap.
// Returns an empty CRef when the feature is not in the map.
CRef<CFeatureIndex> CBioseqIndex::GetFeatIndex (const CMappedFeat& mf)
{
    TFeatIndexMap::iterator found = m_FeatIndexMap.find(mf);
    if (found == m_FeatIndexMap.end()) {
        return CRef<CFeatureIndex> ();
    }

    return found->second;
}

// Fills buffer with Iupac-coded sequence data for the requested range of this Bioseq.
//   from   - start position; negative values are clamped to 0
//   to     - end position passed to CSeqVector::GetSeqData; a negative value or one at or
//            beyond the vector size selects the vector size
//   buffer - receives the sequence data
// The CSeqVector is created on first use and kept in m_SeqVec.  m_FetchFailure is set when
// the range cannot be retrieved or when an exception is caught.
void CBioseqIndex::GetSequence (int from, int to, string& buffer)

{
    try {
        if (! m_SeqVec) {
            m_SeqVec = new CSeqVector(m_Bsh);
            if (m_SeqVec) {
                m_SeqVec->SetCoding(CBioseq_Handle::eCoding_Iupac);
            }
        }

        if (m_SeqVec) {
            CSeqVector& vec = *m_SeqVec;
            if (from < 0) {
                from = 0;
            }
            if (to < 0 || to >= (int) vec.size()) {
                to = (int) vec.size();
            }
            if (vec.CanGetRange(from, to)) {
                vec.GetSeqData(from, to, buffer);
            } else {
                m_FetchFailure = true;
            }
        }
    }
    catch (CException& e) {
        // record the failure so callers can detect it, matching the handlers in
        // x_InitFeats and CFeatureIndex::GetSequence
        m_FetchFailure = true;
        LOG_POST(Error << "Error in CBioseqIndex::GetSequence: " << e.what());
    }
}

// Convenience overload: returns the requested range as a new string
// (empty if nothing could be retrieved)
string CBioseqIndex::GetSequence (int from, int to)
{
    string seq;
    GetSequence(from, to, seq);
    return seq;
}

// Convenience overload: fills buffer with the whole sequence
void CBioseqIndex::GetSequence (string& buffer)
{
    const int first = 0;
    const int through_end = -1;  // negative "to" selects the end of the sequence

    GetSequence(first, through_end, buffer);
}

// Convenience overload: returns the whole sequence as a new string
string CBioseqIndex::GetSequence (void)
{
    string seq;
    // from 0, with a negative "to" selecting the end of the sequence
    GetSequence(0, -1, seq);
    return seq;
}

// Returns m_GapList, running x_InitGaps first if gaps are not yet initialized
const vector<CRef<CGapIndex>>& CBioseqIndex::GetGapIndices(void)
{
    if (m_GapsInitialized) {
        return m_GapList;
    }

    x_InitGaps();
    return m_GapList;
}

// Returns m_SdxList, running x_InitDescs first if descriptors are not yet initialized
const vector<CRef<CDescriptorIndex>>& CBioseqIndex::GetDescriptorIndices(void)
{
    if (m_DescsInitialized) {
        return m_SdxList;
    }

    x_InitDescs();
    return m_SdxList;
}

// Returns m_SfxList, running x_InitFeats first if features are not yet initialized
const vector<CRef<CFeatureIndex>>& CBioseqIndex::GetFeatureIndices(void)
{
    if (m_FeatsInitialized) {
        return m_SfxList;
    }

    x_InitFeats();
    return m_SfxList;
}


// CGapIndex

// Constructor: stores the gap coordinates, type, evidence strings, and flags as given,
// and records the owning CBioseqIndex in m_Bsx
CGapIndex::CGapIndex (TSeqPos start, TSeqPos end, TSeqPos length,
                      const string& type, const vector<string>& evidence,
                      bool isUnknownLength, bool isAssemblyGap,
                      CBioseqIndex& bsx)
    : m_Start(start), m_End(end), m_Length(length),
      m_GapType(type), m_GapEvidence(evidence),
      m_IsUnknownLength(isUnknownLength), m_IsAssemblyGap(isAssemblyGap),
      m_Bsx(&bsx)
{
}


// CDescriptorIndex

// Constructor: records the descriptor and its owning CBioseqIndex,
// then caches the descriptor's choice in m_Type
CDescriptorIndex::CDescriptorIndex (const CSeqdesc& sd, CBioseqIndex& bsx)
    : m_Sd(sd), m_Bsx(&bsx)
{
    m_Type = m_Sd.Which();
}


// CFeatureIndex

// Constructor: records the feature handle, mapped feature, and owning CBioseqIndex, then
// caches the feature type and subtype, the mapped location, and its positional extremes
CFeatureIndex::CFeatureIndex (CSeq_feat_Handle sfh, const CMappedFeat mf, CBioseqIndex& bsx)
    : m_Sfh(sfh), m_Mf(mf), m_Bsx(&bsx)
{
    // feature type and subtype come from the feature data
    const CSeqFeatData& feat_data = m_Mf.GetData();
    m_Type = feat_data.Which();
    m_Subtype = feat_data.GetSubtype();

    // location and endpoints come from the mapped feature
    const CSeq_feat& mapped = m_Mf.GetMappedFeature();
    CConstRef<CSeq_loc> mapped_loc(&mapped.GetLocation());
    m_Fl = mapped_loc;
    m_Start = mapped_loc->GetStart(eExtreme_Positional);
    m_End = mapped_loc->GetStop(eExtreme_Positional);
}

// Find CFeatureIndex object for best gene using the owning Bioseq's CFeatTree.
// Returns an empty CRef when the owning CBioseqIndex is gone, when no gene is found,
// or when an exception is caught (the exception is logged).
CRef<CFeatureIndex> CFeatureIndex::GetBestGene (void)
{
    try {
        CMappedFeat gene;
        CWeakRef<CBioseqIndex> weak_owner = GetBioseqIndex();
        auto owner = weak_owner.Lock();
        if (owner) {
            // overlapping genes are acceptable candidates
            gene = feature::GetBestGeneForFeat(m_Mf, &owner->GetFeatTree(), 0,
                                               feature::CFeatTree::eBestGene_AllowOverlapped);
            if (gene) {
                // translate the mapped feature back into its index object
                return owner->GetFeatIndex(gene);
            }
        }
    } catch (CException& e) {
        LOG_POST(Error << "Error in CFeatureIndex::GetBestGene: " << e.what());
    }

    return CRef<CFeatureIndex> ();
}

void CFeatureIndex::SetFetchFailure (bool fails)

{
    CWeakRef<CBioseqIndex> bsx = GetBioseqIndex();
    auto bsxl = bsx.Lock();
    if (bsxl) {
        bsxl->SetFetchFailure(fails);
    }
}

void CFeatureIndex::GetSequence (int from, int to, string& buffer)

{
    try {
        if (! m_SeqVec) {
            CWeakRef<CBioseqIndex> bsx = GetBioseqIndex();
            auto bsxl = bsx.Lock();
            if (bsxl) {
                CConstRef<CSeq_loc> lc = GetMappedLocation();
                if (lc) {
                    m_SeqVec = new CSeqVector(*lc, *bsxl->GetScope());
                    if (m_SeqVec) {
                        m_SeqVec->SetCoding(CBioseq_Handle::eCoding_Iupac);
                    }
                }
            }
        }

        if (m_SeqVec) {
            CSeqVector& vec = *m_SeqVec;
            if (from < 0) {
                from = 0;
            }
            if (to < 0 || to >= (int) vec.size()) {
                to = vec.size();
            }
            if (vec.CanGetRange(from, to)) {
                vec.GetSeqData(from, to, buffer);
            } else {
                SetFetchFailure(true);
            }
        }
    }
    catch (CException& e) {
        SetFetchFailure(true);
        LOG_POST(Error << "Error in CFeatureIndex::GetSequence: " << e.what());
    }
}

// Convenience overload: returns the requested range as a new string
// (empty if nothing could be retrieved)
string CFeatureIndex::GetSequence (int from, int to)
{
    string seq;
    GetSequence(from, to, seq);
    return seq;
}

// Convenience overload: fills buffer with the sequence under the whole feature location
void CFeatureIndex::GetSequence (string& buffer)
{
    const int first = 0;
    const int through_end = -1;  // negative "to" selects the end of the sequence

    GetSequence(first, through_end, buffer);
}

// Convenience overload: returns the sequence under the whole feature location as a new string
string CFeatureIndex::GetSequence (void)
{
    string seq;
    // from 0, with a negative "to" selecting the end of the sequence
    GetSequence(0, -1, seq);
    return seq;
}


// CWordPairIndexer

// superscript and subscript code points not handled by UTF8ToAsciiString
// Each pair maps one Unicode code point to the single ASCII character that
// ConvertUTF8ToAscii substitutes for it.  Entries are listed in ascending
// code point order.
// NOTE(review): the static array map presumably requires sorted keys - keep
// the ordering when adding entries.
typedef SStaticPair        <utf8::TUnicode, char> TExtraTranslationPair;
typedef CStaticPairArrayMap<utf8::TUnicode, char> TExtraTranslations;
static const TExtraTranslationPair kExtraTranslations[] = {
    // Latin-1 superscript digits
    { 0x00B2, '2' },
    { 0x00B3, '3' },
    { 0x00B9, '1' },
    // superscripts, U+2070 - U+207F
    { 0x2070, '0' },
    // NOTE(review): Unicode names U+2071 SUPERSCRIPT LATIN SMALL LETTER I, yet it
    // is mapped to '1' here - confirm whether 'i' was intended
    { 0x2071, '1' },
    { 0x2074, '4' },
    { 0x2075, '5' },
    { 0x2076, '6' },
    { 0x2077, '7' },
    { 0x2078, '8' },
    { 0x2079, '9' },
    { 0x207A, '+' },
    { 0x207B, '-' },
    { 0x207C, '=' },
    { 0x207D, '(' },
    { 0x207E, ')' },
    { 0x207F, 'n' },
    // subscripts, U+2080 - U+208E
    { 0x2080, '0' },
    { 0x2081, '1' },
    { 0x2082, '2' },
    { 0x2083, '3' },
    { 0x2084, '4' },
    { 0x2085, '5' },
    { 0x2086, '6' },
    { 0x2087, '7' },
    { 0x2088, '8' },
    { 0x2089, '9' },
    { 0x208A, '+' },
    { 0x208B, '-' },
    { 0x208C, '=' },
    { 0x208D, '(' },
    { 0x208E, ')' }
};
// lookup table used by ConvertUTF8ToAscii
DEFINE_STATIC_ARRAY_MAP(TExtraTranslations, sc_ExtraTranslations,
                        kExtraTranslations);

// Converts a UTF-8 string to ASCII.
//   - bytes below 128 are copied unchanged
//   - code points listed in sc_ExtraTranslations (superscripts and subscripts) are
//     replaced by their single-character equivalent
//   - other code points are replaced by the substitution string from
//     utf8::UnicodeToAscii, or dropped when there is none or it is marked eSkip
//   - bytes that do not start a valid UTF-8 sequence are dropped
// Processing stops at the first NUL byte in str.
string CWordPairIndexer::ConvertUTF8ToAscii( const string& str )

{
    const char* src = str.c_str();
    string dst;
    while (*src) {
        if (static_cast<unsigned char>(*src) < 128) { // no translation needed
            dst += *src++;
            continue;
        }

        utf8::TUnicode character = 0;
        const size_t n = utf8::UTF8ToUnicode(src, &character);
        if (n == 0) {
            // malformed sequence: UTF8ToUnicode consumed nothing, so step over the
            // offending byte explicitly, otherwise this loop would never advance
            ++src;
            continue;
        }
        src += n;

        TExtraTranslations::const_iterator it
            = sc_ExtraTranslations.find(character);
        if (it != sc_ExtraTranslations.end()) {
            dst += it->second;
            continue;
        }

        const utf8::SUnicodeTranslation* translation =
            utf8::UnicodeToAscii(character);
        if (translation != NULL  &&  translation->Type != utf8::eSkip) {
            _ASSERT(translation->Type == utf8::eString);
            if (translation->Subst != NULL) {
                dst += translation->Subst;
            }
        }
    }
    return dst;
}

// Stop words: terms that IsStopWord reports as not to be indexed.
// All entries are lower case and listed in ascending byte order; the set is
// searched with the case-sensitive PCase_CStr comparator.
// NOTE(review): the static array set presumably requires sorted input - keep
// the ordering when adding words.
static const char* const idxStopWords[] = {
    "+",
    "-",
    "a",
    "about",
    "again",
    "all",
    "almost",
    "also",
    "although",
    "always",
    "among",
    "an",
    "and",
    "another",
    "any",
    "are",
    "as",
    "at",
    "be",
    "because",
    "been",
    "before",
    "being",
    "between",
    "both",
    "but",
    "by",
    "can",
    "could",
    "did",
    "do",
    "does",
    "done",
    "due",
    "during",
    "each",
    "either",
    "enough",
    "especially",
    "etc",
    "for",
    "found",
    "from",
    "further",
    "had",
    "has",
    "have",
    "having",
    "here",
    "how",
    "however",
    "i",
    "if",
    "in",
    "into",
    "is",
    "it",
    "its",
    "itself",
    "just",
    "kg",
    "km",
    "made",
    "mainly",
    "make",
    "may",
    "mg",
    "might",
    "ml",
    "mm",
    "most",
    "mostly",
    "must",
    "nearly",
    "neither",
    "no",
    "nor",
    "obtained",
    "of",
    "often",
    "on",
    "our",
    "overall",
    "perhaps",
    "pmid",
    "quite",
    "rather",
    "really",
    "regarding",
    "seem",
    "seen",
    "several",
    "should",
    "show",
    "showed",
    "shown",
    "shows",
    "significantly",
    "since",
    "so",
    "some",
    "such",
    "than",
    "that",
    "the",
    "their",
    "theirs",
    "them",
    "then",
    "there",
    "therefore",
    "these",
    "they",
    "this",
    "those",
    "through",
    "thus",
    "to",
    "upon",
    "use",
    "used",
    "using",
    "various",
    "very",
    "was",
    "we",
    "were",
    "what",
    "when",
    "which",
    "while",
    "with",
    "within",
    "without",
    "would",
};
// set of C strings compared case-sensitively, built over idxStopWords
typedef CStaticArraySet<const char*, PCase_CStr> TStopWords;
DEFINE_STATIC_ARRAY_MAP(TStopWords, sc_StopWords, idxStopWords);

bool CWordPairIndexer::IsStopWord(const string& str)

{
    TStopWords::const_iterator iter = sc_StopWords.find(str.c_str());
    return (iter != sc_StopWords.end());
}

// Returns a copy of str with flanking punctuation removed, in this order:
//   1. leading, then trailing, runs of period, comma, colon, and semicolon
//   2. one pair of parentheses enclosing the whole term
//   3. a leading '(' when the term contains no ')'
//   4. a trailing ')' when the term contains no '(' (only if more than one character remains)
// Parentheses inside the term are kept.
string CWordPairIndexer::TrimPunctuation (const string& str)
{
    auto is_separator = [](char c) {
        return c == '.' || c == ',' || c == ':' || c == ';';
    };

    string word(str);

    // leading separators
    string::size_type lead = 0;
    while (lead < word.size() && is_separator(word[lead])) {
        ++lead;
    }
    word.erase(0, lead);

    // trailing separators
    while (! word.empty() && is_separator(word[word.size() - 1])) {
        word.erase(word.size() - 1, 1);
    }

    // flanking parentheses
    if (word.size() > 1 && word[0] == '(' && word[word.size() - 1] == ')') {
        word.erase(word.size() - 1, 1);
        word.erase(0, 1);
    }

    // isolated left parenthesis
    if (! word.empty() && word[0] == '(' && NStr::Find (word, ")") == NPOS) {
        word.erase(0, 1);
    }

    // isolated right parenthesis
    if (word.size() > 1 && word[word.size() - 1] == ')' && NStr::Find (word, "(") == NPOS) {
        word.erase(word.size() - 1, 1);
    }

    return word;
}

// Inline markup tags (bold, italic, underline, superscript, subscript) that are
// removed from text before indexing, in raw form, with angle brackets escaped
// once (&lt; &gt;), and with the escape itself escaped again (&amp;lt; &amp;gt;).
static const char* const mixedTags[] = {
    "<b>",
    "<i>",
    "<u>",
    "<sup>",
    "<sub>",
    "</b>",
    "</i>",
    "</u>",
    "</sup>",
    "</sub>",
    "<b/>",
    "<i/>",
    "<u/>",
    "<sup/>",
    "<sub/>",
    "&lt;i&gt;",
    "&lt;/i&gt;",
    "&lt;i/&gt;",
    "&lt;b&gt;",
    "&lt;/b&gt;",
    "&lt;b/&gt;",
    "&lt;u&gt;",
    "&lt;/u&gt;",
    "&lt;u/&gt;",
    "&lt;sub&gt;",
    "&lt;/sub&gt;",
    "&lt;sub/&gt;",
    "&lt;sup&gt;",
    "&lt;/sup&gt;",
    "&lt;sup/&gt;",
    "&amp;lt;i&amp;gt;",
    "&amp;lt;/i&amp;gt;",
    "&amp;lt;i/&amp;gt;",
    "&amp;lt;b&amp;gt;",
    "&amp;lt;/b&amp;gt;",
    "&amp;lt;b/&amp;gt;",
    "&amp;lt;u&amp;gt;",
    "&amp;lt;/u&amp;gt;",
    "&amp;lt;u/&amp;gt;",
    "&amp;lt;sub&amp;gt;",
    "&amp;lt;/sub&amp;gt;",
    "&amp;lt;sub/&amp;gt;",
    "&amp;lt;sup&amp;gt;",
    "&amp;lt;/sup&amp;gt;",
    "&amp;lt;sup/&amp;gt;",
};

// Returns the length of the mixedTags entry that the text at ptr starts with,
// or 0 when it starts with none of them.  Matching is case-sensitive and the
// first matching entry wins.
//   ptr - NUL-terminated text; must not be null
static int SkipMixedContent ( const char* ptr )

{
    // iterate over the array elements themselves; a bound of sizeof(mixedTags)
    // would count bytes and run far past the end of the array
    for (const char* candidate : mixedTags) {
        const char* tag = candidate;
        const char* txt = ptr;
        // advance while the tag still has characters and they agree with the text
        // (a NUL in the text cannot equal a non-NUL tag character, so this also
        // stops at the end of the text)
        while (*tag && *tag == *txt) {
            ++tag;
            ++txt;
        }
        if (! *tag) {
            // whole tag consumed: report its length
            return static_cast<int>(tag - candidate);
        }
    }
    return 0;
}

// Returns a copy of str with every recognized markup tag (see SkipMixedContent) removed.
// A '<' or '&' that does not start a recognized tag is copied through unchanged.
string CWordPairIndexer::TrimMixedContent ( const string& str )
{
    string cleaned;

    for (const char* cursor = str.c_str(); *cursor; ) {
        // only '<' and '&' can begin a tag
        const bool may_be_tag = (*cursor == '<' || *cursor == '&');
        const int tag_len = may_be_tag ? SkipMixedContent (cursor) : 0;

        if (tag_len > 0) {
            cursor += tag_len;
        } else {
            cleaned += *cursor++;
        }
    }

    return cleaned;
}

// Adds item to m_Norm and, when prev is not empty, adds "prev item" to m_Pair.
// Stop words are not indexed.
// Returns item so the caller can pass it as prev for the next term, or an empty
// string for a stop word, which breaks the pair chain.
string CWordPairIndexer::x_AddToWordPairIndex (string item, string prev)
{
    if (IsStopWord(item)) {
        return "";
    }

    // single term
    m_Norm.push_back(item);

    // adjacent pair, separated by one space
    if (! prev.empty()) {
        m_Pair.push_back(prev + " " + item);
    }

    return item;
}

// Rebuilds m_Norm (single terms) and m_Pair (adjacent term pairs) from str.
// The text is converted to ASCII, lower-cased, and stripped of markup tags, then
// indexed in two passes:
//   1. split at spaces, with flanking punctuation trimmed from each term
//      (parentheses inside a term, as in a chemical formula, are kept)
//   2. split at every non-alphanumeric character
// Stop words are left out by x_AddToWordPairIndex.  Both lists end up sorted
// with duplicates removed.
void CWordPairIndexer::PopulateWordPairIndex (string str)

{
    m_Norm.clear();
    m_Pair.clear();

    str = ConvertUTF8ToAscii(str);
    NStr::ToLower(str);

    if (NStr::Find(str, "<") != NPOS || NStr::Find(str, "&") != NPOS) {
        str = TrimMixedContent(str);
    }

    // split terms at spaces
    list<string> terms;
    NStr::Split( str, " ", terms, NStr::fSplit_Tokenize );
    string prev = "";
    for (const string& term : terms) {
        string curr = NStr::TruncateSpaces( term );
        // allow parentheses in chemical formula
        curr = TrimPunctuation(curr);
        prev = x_AddToWordPairIndex (curr, prev);
    }

    // convert non-alphanumeric punctuation to space
    for (char& ch : str) {
        const bool is_alnum = (ch >= 'A' && ch <= 'Z')
                           || (ch >= 'a' && ch <= 'z')
                           || (ch >= '0' && ch <= '9');
        if (! is_alnum) {
            ch = ' ';
        }
    }

    // now splitting at all punctuation
    list<string> words;
    NStr::Split( str, " ", words, NStr::fSplit_Tokenize );
    prev = "";
    for (const string& word : words) {
        string curr = NStr::TruncateSpaces( word );
        prev = x_AddToWordPairIndex (curr, prev);
    }

    // sort and remove duplicates
    std::sort(m_Norm.begin(), m_Norm.end());
    auto nit = std::unique(m_Norm.begin(), m_Norm.end());
    m_Norm.erase(nit, m_Norm.end());

    std::sort(m_Pair.begin(), m_Pair.end());
    auto pit = std::unique(m_Pair.begin(), m_Pair.end());
    m_Pair.erase(pit, m_Pair.end());
}


END_SCOPE(objects)
END_NCBI_SCOPE
