#!/bin/sh

# Convert every *.xml.gz file in the current directory into an indented
# "$base.xml" file by running it through the external xtract and rchive
# commands, then delete the compressed source.
#
# A source archive is removed only after its own conversion succeeded, so a
# failed or interrupted run never destroys input that has no usable output.
# On failure the partial "$base.xml" is removed and the archive is kept.
# The script exits with status 1 if any file failed.
# NOTE(review): the exact semantics of the xtract/rchive flags are not
# visible here; rchive -unique with the PMID list presumably drops duplicate
# records - confirm against the tool documentation.

status=0

for fl in *.xml.gz
do
  # With no matching files the pattern stays literal; skip it instead of
  # processing a file called "*.xml.gz".
  [ -f "$fl" ] || continue

  printf '%s\n' "$fl"
  base=${fl%.xml.gz}

  # gzip -t verifies the archive up front: in POSIX sh the status of a
  # pipeline is that of its last command only, so a gunzip failure inside
  # the pipeline below would otherwise go unnoticed.
  if gzip -t -- "$fl" &&
    gunzip -c -- "$fl" |
      xtract -strict -compress -format flush > "$base.tmp" &&
    xtract -input "$base.tmp" -pattern PubmedArticle \
      -element MedlineCitation/PMID > "$base.uid" &&
    rchive -input "$base.tmp" -unique "$base.uid" -index MedlineCitation/PMID \
      -head "<PubmedArticleSet>" -tail "</PubmedArticleSet>" -pattern PubmedArticle |
      xtract -format indent -xml '<?xml version="1.0" encoding="UTF-8"?>' \
        -doctype '<!DOCTYPE PubmedArticleSet SYSTEM "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_180601.dtd">' > "$base.xml"
  then
    # Output is complete; only now is it safe to drop the source archive.
    rm -f -- "$fl"
  else
    printf 'error: failed to process %s; keeping source archive\n' "$fl" >&2
    # Do not leave a truncated result that looks like valid output.
    rm -f -- "$base.xml"
    status=1
  fi

  # Intermediate files are never needed after this iteration.
  rm -f -- "$base.tmp" "$base.uid"
done

# Report failure to the caller if any file could not be converted.
[ "$status" -eq 0 ] || exit "$status"
