# (extraction artifact: document-hosting-site navigation text — "Académique Documents",
# "Professionnel Documents", "Culture Documents" — commented out; not part of the script)
#!/bin/bash
# article: a bash script to download a paper indexed by arXiv, Mathematical
# Reviews, Zentralblatt MATH, CrossRef, CiteSeerX, Project Euclid
# and save it under a file name like smith-brown-an-example-of-a-model-category.pdf.
# See the source code for the list of supported HTTP URLs.
# Copyright 2014, 2015 Dmitri Pavlov. Distributed under the terms of the GNU
# Affero General Public License version 3.
version=2017-08-18
# Abort on unset variables, failed commands, and failed pipeline stages.
set -u -e -o pipefail
echoerr() {
  # Diagnostics go to stderr so stdout stays clean for data.
  echo "$@" >&2
}
fatal() {
  # Report a fatal error on stderr and terminate the whole script.
  echoerr "$@"
  exit 1
}
echon() {
  # Normal diagnostics: suppressed when -q (quiet) was given.
  [[ -v quiet ]] || echoerr "$@"
}
echov() {
  # Extra diagnostics: printed only when -v (verbose) was given.
  [[ ! -v verbose ]] || echoerr "$@"
}
syn() {
  # Print the usage summary and abort (via fatal).
  fatal "Synopsis: $0" '[ options ] [ MR[0-9]* | Zbl:[0-9]*.[0-9]* | arXiv:[0-9]*.[0-9]* | arXiv:[-a-z]*/[0-9]* | doi:.* | cs:[0-9.]* | http://.* | https://.* ]' "
-d directory: directory where to save the file
-q: quiet, do not output any diagnostics
-v: verbose, print additional diagnostics
-i: offer an interactive choice of a full-text URL to download
-u url: use url as a full text URL
-f: fancy file names
-n: dry run: print the final file name and the full text URLs, but do not download anything
-a: extract and print abstract page URLs, do not download anything
-e command arguments --: execute a command after a successful download
-p pairing: use an AMS pairing key to access MathSciNet"
}
# Collected candidate full-text URLs; filled in by the handlers below.
urls=()
echon "Scientific article full text downloader by Dmitri Pavlov, version $version."
echon "To report bugs and missing features, please email me (host math.berkeley.edu, user pavlov).
Please include the command line and the output of the script when run with the -v option in your email.
Before submitting a bug report, please make sure that you can download the full text using your browser;
the inability of the script to download the full text is often an indication that you don't have a subscription.
"
texsimp() {
# Convert TeX accent/ligature markup on stdin to Unicode (texuni) and
# normalize to NFC (uconv). Requires the external texuni and uconv tools.
texuni | uconv -x any-nfc
}
texstrip() {
  # Remove TeX control sequences (backslash + letters) and math-mode
  # punctuation characters from stdin.
  sed -e 's/\\[a-zA-Z]*//g' -e 's/[$^_{}]//g'
}
texnorm() {
# Full TeX cleanup of stdin: Unicode-normalize, then strip leftover markup.
texsimp | texstrip
}
fetch() {
# curl wrapper: silent progress (-s) but show errors (-S), fail on HTTP
# errors (-f). Extra arguments are passed straight to curl.
echov Fetching "$@"
cline=(curl -s -S -f)
# If an AMS pairing key was supplied (-p), send it as a cookie, but only
# when the last argument (the URL, "${@:$#}") is on an ams.org host.
if [[ -v amspairing ]]; then
case "${@:$#}" in
*.ams.org/*)
cline+=("-b" "amspairing=$amspairing") ;;
esac
fi
"${cline[@]}" "$@"
}
fetchc() {
# fetch with an empty throwaway cookie file (some servers insist on cookies).
fetch -b /dev/null "$@"
}
fetchr() {
# fetch following HTTP redirects.
fetch -L "$@"
}
fetchcr() {
# fetchc following HTTP redirects: cookies enabled + -L.
fetchc -L "$@"
}
sfetch() {
# HEAD request that prints only the redirect target URL (empty if none);
# the response body itself is discarded.
fetch -o /dev/null -I -w "%{redirect_url}\n" "$@"
}
fetchz() {
  # fetch wrapper for zbMATH that handles its CAPTCHA interstitial:
  # keep refetching (with the user-typed solution) until the response no
  # longer mentions a captcha, then print the page on stdout.
  data="$(fetch "$@")"
  while [[ $data == *captcha* ]]; do
    echon zbMATH demands a CAPTCHA, which means that no subscription is available. Manual entry.
    id="$(printf '%s\n' "$data" | grep captcha_id | sed 's/.*value="\([^"]*\)".*/\1/')"
    # 'show' (an image-viewer hook) is presumably defined elsewhere in the
    # full script -- not visible in this chunk.
    show "https://zbmath.org/captcha/$id"
    echo Enter zbMATH CAPTCHA:
    read -r captcha
    echov Entered CAPTCHA: "$captcha"
    data="$(fetch -F captcha_id="$id" -F captcha_solution="$captcha" "$@")"
  done
  printf '%s\n' "$data"
}
retft() {
  # Try to download the full text at URL $1 into temp file $2; applies
  # per-site curl quirks, validates HTTP status and Content-Type, and sets
  # the global $dextn (chosen file extension). Returns non-zero on failure.
  local pdf="$1"
  echon Attempting to retrieve the full text using URL "$pdf"
  local tname="$2"
  local jar="/dev/null"
  local -a addopts=("-L")
  # Per-site quirks: cookies, referrers, user agents.
  case "$pdf" in
  http://www.jstor.org/*)
    jar="$(mktemp /tmp/article-XXX)"
    echov Special cookie treatment for JSTOR
    echov Cookie jar file: "$jar"
    fetchcr -c "$jar" -I "$pdf"
    pdf="$pdf"'?acceptTC=true' ;;
  http*://projecteuclid.org/*)
    echov Publisher: Project Euclid
    echov Adding referrer
    addopts+=("-e" "https://projecteuclid.org") ;;
  http*://*cms.math.ca/*)
    echov Publisher: CMS
    echov Adding referrer
    addopts+=("-e" "https://cms.math.ca/") ;;
  http://libgen.*)
    echov Repository: Library Genesis
    echov Adding referrer
    addopts+=("-e" "http://libgen.io/scimag/ads.php") ;;
  http://booksc.*)
    echov Repository: Library Genesis BookSC
    echov Adding referrer
    addopts+=("-e" "http://booksc.org") ;;
  http://*sciencedirect*)
    echov Repository: Elsevier
    echov Adding user agent
    addopts+=("-A" "Mozilla") ;;
  http*://*ams.org/*)
    echov Publisher: AMS
    if [[ -v amspairing ]]; then
      echov Adding pairing key
      jar="amspairing=$amspairing"
    else
      echov No pairing key specified
    fi ;;
  esac
  # NOTE(review): the curl invocation that performs the actual download
  # (using $addopts, $jar and $tname) and sets $code (HTTP status) and
  # $type (Content-Type) appears to have been lost in the text extraction --
  # nothing between building the options above and inspecting "$code" below.
  # Restore it from the upstream script before using this function.
  case "$code" in
  200) ;;
  401)
    echon "HTTP code 401 (Unauthorized) most likely means that you have no subscription to this resource."
    echon "Check whether you have a subscription; if you can successfully download the full text file, please email me."
    return 1 ;;
  404)
    echon "HTTP code 404 (Not Found) usually means that there is a bug in the script, unless downloading from one of Library Genesis mirrors."
    echon "Please email me the command line and the output of the script so that I can fix the bug."
    return 1 ;;
  *)
    echon "Error: HTTP code is $code, not 200, downloaded file ignored."
    echon "Check whether you have a subscription; if you can successfully download the full text file, please email me."
    return 1 ;;
  esac
  # Map the reported Content-Type to a file extension.
  case "$type" in
  application/pdf) echon PDF file; dextn="pdf" ;;
  application/pdf*) echon PDF file from JSTOR; dextn="pdf" ;; # JSTOR server is run by incompetent people
  text/pdf) echon PDF file from CUP; dextn="pdf" ;; # CUP server is also run by incompetent people
  application/postscript) echon PostScript; dextn="ps" ;;
  image/vnd.djvu) echon DjVu; dextn="djvu" ;;
  application/x-dvi) echon DVI; dextn="dvi" ;;
  application/x-tar) echon DVI in a TAR file; dextn="tar" ;;
  application/octet-stream|application/download*)
    # Library Genesis serves octet-stream; use the extension it reported
    # (global $extn, set by gen()).
    if [[ -v extn ]]; then
      echon File from Library Genesis, extension "$extn"
      dextn="$extn"
    else
      echon "No extension supplied for application/octet-stream. Report this error to me by email."
      return 1
    fi ;;
  *)
    echon "Unrecognized Content-Type: not PDF, PostScript, DjVu, or DVI, downloaded file ignored."
    echon "This might mean that you have no subscription to this content, because many scientific repositories use a brain-damaged way to report authorization errors."
    echon "If you do have a subscription and can successfully download and view the full text file, please email me."
    return 1 ;;
  esac
  echon Extension: "$dextn"
}
# Tracks raw PDF-ish URLs already queued, to avoid duplicates across
# recursive fturl calls.
declare -A xtried
fturl() {
  # Given an abstract-page URL ($1), append candidate full-text URLs to the
  # global 'urls' array; may recurse (CrossRef forks, EuDML, DOIs found in
  # generic pages). With -a (abstract) it only prints the URL.
  if [[ -v abstract ]]; then
    echo "$1"
    return
  fi
  local url="$1"
  echon Abstract page URL: "$url"
  set +e # allow curl to fail so that doi links still get processed
  meta="$(fetchcr -I -A / -w "%{content_type}\n%{url_effective}\n" "$url")"
  set -e
  ctype="$(echo "$meta" | tail -2 | head -1)"
  echon Content-Type: "$ctype"
  crurl="$(echo "$meta" | tail -1)"
  echon Completely resolved URL: "$crurl"
  if [[ "$ctype" == "application/pdf" ]]; then
    urls+=("$crurl")
    echov "Content-Type is application/pdf; added direct PDF URL $crurl"
    data=""
  else
    set +e
    data="$(fetchcr -A / "$url")"
    set -e
  fi
  # Generic-URL mode (auti): if the page mentions a DOI, restart from it.
  if [[ -v auti ]]; then
    if echo "$data" | grep -Ec 'doi.org/|"doi"' >/dev/null; then
      dois="$(echo "$data" | sed -n 's@.*doi.org/\([^ "<]*\).*@\1@p;s/"doi":"\([^"]*\)"/\1/p')"
      echov Extracted DOIs:
      echov "$dois"
      doi "$(echo "$dois" | head -1)"
      return
    fi
  fi
  if echo "$data" | grep -c citation_pdf_url >/dev/null; then
    echon 'Generic method (citation_pdf_url), e.g., AMS, EMS, Project Euclid, CUP, OUP, Springer, de Gruyter, Wiley'
    local pdf
    pdf="$(echo "$data" | tr \\n ' ' | sed -n 's@.*\(<[^>]*citation_pdf_url[^>]*>\).*@\1@p' | sed -n 's@.*content=[^"]*"\([^"]*\)".*@\1@p')"
    echon citation_pdf_url: "$pdf"
    case "$pdf" in
    http://onlinelibrary.wiley.com/doi/*)
      echov Publisher: Wiley
      # NOTE(review): extraction collapsed the HTML entity here; the sed
      # was presumably 's/&amp;/\&/g' (a bare 's/&/\&/g' is a no-op).
      pdf="$(fetch "$pdf" | sed -n 's@.*id="pdfDocument" src="\([^"]*\)".*@\1@p' | sed 's/&amp;/\&/g')"
      if [[ -z "$pdf" ]]; then
        echon 'No Wiley PDF URL found; possible cause: no subscription'
        unset pdf
      else
        echov Adjusted Wiley PDF URL: "$pdf"
      fi ;;
    http://journals.cambridge.org/*)
      echov Publisher: CUP
      pdf="$(sfetch "$pdf")""&toPdf=true"
      echov Adjusted CUP PDF URL: "$pdf" ;;
    http://*)
      echov Generic HTTP URL ;;
    https://*)
      echov Generic HTTPS URL ;;
    *)
      echov Generic relative URL
      urlbase="$(sfetch "$url" | sed -n 's@^\(http.*//[^/]*\).*@\1@p')"
      echov Base "$urlbase"
      pdf="$urlbase$pdf"
      echov Adjusted URL: "$pdf" ;;
    esac
    case "$data" in
    *"Duke Mathematical Journal"*)
      echov Switching to the nonenhanced PDF for the Duke Mathematical Journal due to an insane color scheme
      pdf="${pdf//pdfview/pdf}" ;;
    esac
    if [[ -v pdf ]]; then
      urls+=("$pdf")
      echon Added citation_pdf_url "$pdf"
    fi
  fi
  if [[ -v auti ]]; then
    echov Attempting to extract title and authors from the HTML
    # Re-encode the page to UTF-8, guessing the charset with file(1).
    data="$(echo "$data" | iconv -f "$(echo "$data" | file -b --mime-encoding -)" -t utf-8 -c)"
    if echo "$data" | grep -c citation_title >/dev/null; then
      title="$(echo "$data" | tr \\n ' ' | sed -n 's@.*\(<[^>]*citation_title[^>]*>\).*@\1@p' | tee | sed -n 's@.*content=[^"]*"\([^"]*\)".*@\1@p')"
      echov citation_title: "$title"
    fi
    if echo "$data" | grep -c citation_author >/dev/null; then
      authors="$(echo "$data" | sed -n 's@.*\(<[^>]*citation_author[^>]*>\).*@\1@p' | sed -n 's@.*content=[^"]*"\([^"]*\)".*@\1@p')"
      echov citation_author: "$authors"
      # "Last, First" style -> keep text before comma; else keep last word.
      if echo "$authors" | grep ,; then
        authors="$(echo "$authors" | sed 's/,.*//')"
        echov Author last names before commas: "$authors"
      else
        authors="$(echo "$authors" | sed 's/.* \([^ ]*\)/\1/')"
        echov Author last names: "$authors"
      fi
    fi
  fi
  local pdf
  # Resolve DOI URLs to the publisher's page.
  case "$url" in
  http://*doi.org/* | https://*doi.org/*)
    doi="${url##*doi.org/}"
    echon DOI: "$doi"
    echov DOI URL: "$url"
    rurl="$(sfetch "$url")"
    echon Resolved DOI: "$rurl" ;;
  *)
    rurl="$url" ;;
  esac
  url="$rurl"
  # Publisher-specific PDF URL construction.
  case "$rurl" in
  http://mr.crossref.org/*)
    echon CrossRef fork
    links="$(fetch "$rurl" | { grep -o "href=['\"][^'\"]*['\"]" || true; } | sed 's/href=.//;s/.$//' | { grep -Ev '^https?://.*doi.org' || true; } | uniq)"
    echon Detected links:
    echon "$links"
    for i in $links; do
      echon Recursively processing link "$i"
      fturl "$i"
    done ;;
  http://linkinghub.elsevier.com/retrieve/pii/*)
    echov Publisher: Elsevier
    eid="${rurl:44}"
    echov Elsevier ID: "$eid"
    #pdf="$(echo "$data" | sed -n 's/.*pdfurl="\([^"]*\)".*/\1/p')"
    eapdata="$(fetch -A / "http://www.sciencedirect.com/science/article/pii/$eid")"
    # NOTE(review): same collapsed '&amp;' entity as in the Wiley case.
    pdf="$(echo "$eapdata" | sed -n 's@.*<a class="pdf-download-btn-link" href="\([^"]*\)".*@https://www.sciencedirect.com\1@p' | sed 's/&amp;/\&/g')"
    if [[ -z "$pdf" ]]; then
      echon 'No Elsevier PDF URL found.'
      pdf="$(echo "$eapdata" | tr \\n @ | sed -n 's/.*pdf-download-link"@ *href="\([^"]*\)".*/\1/p')"
      if [[ -z "$pdf" ]]; then
        echon 'No Elsevier pdf-download-link found.'
        echon 'Possible cause: no subscription.'
        echon 'If you do have subscription, please email me.'
        unset pdf
      fi
    fi
    if [[ -v pdf && $pdf == //* ]]; then
      pdf="http:$pdf"
      echov Adjusting // Elsevier URL to "$pdf"
    fi ;;
  http://www.tandfonline.com/doi/abs/*)
    echov Publisher: Taylor and Francis
    pdf="${rurl//abs/pdf}" ;;
  http://www.worldscientific.com/doi/abs/*)
    echov Publisher: World Scientific
    pdf="${rurl//abs/pdfplus}" ;;
  http://epubs.siam.org/doi/abs/*)
    echov Publisher: SIAM
    pdf="${rurl//abs/pdf}" ;;
  http://www.msp.warwick.ac.uk/*)
    echov Publisher: MSP
    pdf=http://www.msp.warwick.ac.uk"$(echo "$data" | sed -n 's/.*"\([^"]*[sp]\.pdf\)".*/\1/p' | head -1)" ;;
  http://*journalofsing.org/*)
    echov Publisher: Journal of Singularities
    pdf="${rurl%/*}/$(fetch "$rurl" | sed -n 's@.*href="\([^"]*pdf\)" style.*@\1@p')" ;;
  http://www.jstor.org/*)
    echov Publisher: JSTOR
    pdf=http://www.jstor.org/stable/pdfplus/${doi:8}.pdf ;;
  http://www.cms.math.ca/*)
    echov Publisher: CMS
    pdf="http://cms.math.ca$(echo "$data" | sed -n 's@.*<a href="\([^"]*\)">Read article</a>.*@\1@p')" ;;
  http://www.intlpress.com/*)
    echov Publisher: International Press
    pdf="$rurl"$(fetch "$rurl/body.html" | sed -n 's@.*"\([^"]*.pdf\)".*@\1@p') ;;
  http://*.impan.pl/cgi-bin/doi*)
    echov Publisher: IMPAN
    pdf="${rurl//\/doi/\/pdf}"
    # Pad single-digit trailing issue numbers: "...-7" -> "...-07".
    if [[ "${pdf: -2:1}" == "-" ]]; then
      pdf="${pdf:0:-2}-0${pdf: -1:1}"
    fi ;;
  http://retro.seals.ch/digbib/view?rid=*)
    echov Publisher: retro.seals
    pdf="${rurl//digbib\/view?rid=/cntmng?pid=}" ;;
  # end of DOI URLs
  http://www.numdam.org/item?id=*)
    echov Publisher: Numdam
    numdam="${url:30}"
    echov Numdam ID: "$numdam"
    pdf="http://archive.numdam.org/article/$numdam.pdf" ;;
  http://*.cedram.org/item?id=*)
    echov Publisher: Cedram
    numdam="${url#*item?id=}"
    echov Numdam ID: "$numdam"
    pdf="http://archive.numdam.org/article/$numdam.pdf" ;;
  http*://eudml.org/*)
    echov Publisher: EuDML
    pdf="$(echo "$data" | sed -n "s@.*<a href='\\([^']*\\)'"' title="" target="_blank" rel="nofollow">Full (PDF)</a>.*@\1@p')"
    if [ -z "$pdf" ]; then
      link="$(echo "$data" | sed -n "s@.*<a href='\\([^']*\\)'"' title="" target="_blank" rel="nofollow">Access to full text</a>.*@\1@p')"
      echov Intermediate link: "$link"
      fturl "$link"
    fi ;;
  http://muse.jhu.edu/*.pdf)
    echov Publisher: MUSE
    pdf="$url" ;;
  http://www.emis.de/*abs.html)
    echov Publisher: EMIS
    pdf="${url//abs.html/.pdf}" ;;
  http://www.emis.de/*.html)
    echov Publisher: EMIS
    pdf="${url//html/pdf}" ;;
  http://www.digizeitschriften.de/dms/*)
    echov Publisher: DigiZeitschriften
    link="${url//resolveppn/img}"
    echov PDF page for DigiZeitschriften: "$link"
    data="$(fetch "$link")"
    pdf="$(echo "$data" | sed -n 's@.*<a href="\([^"]*\)" class="maintitle_pdf">.*@\1@p')" ;;
  http://gdz.sub.uni-goettingen.de/dms/resolveppn/*)
    echov Publisher: GDZ "$url"
    data="$(fetchr "$url")"
    pdf="$(echo "$data" | sed -n 's@.*<a href="\([^"]*\)" class="maintitle_pdf">.*@\1@p')" ;;
  http://*tac.mta.ca*)
    echov Publisher: TAC
    pdf="${url//abs.html/.pdf}" ;;
  http://www.pnas.org/cgi/doi/*)
    echov Publisher: PNAS
    pdf="$crurl" ;;
  http://tcms.org.ge/*)
    echov Publisher: TCMS
    volume="$(echo "$data" | sed -n 's@.*Vol. \([^(]*\)(.*@\1@p')"
    echov Volume: "$volume"
    trim="${url%/abstract.htm}"
    echov Trimmed URL: "$trim"
    stem="${trim##*/}"
    echov URL stem: "$stem"
    pdf="${trim//volumes/xvolumes}/v${volume}${stem}hl.pdf" ;;
  http://*mathematik.uni-bielefeld.de/documenta/*)
    echov Publisher: Documenta Mathematica
    pdf="${url//html/pdf}" ;;
  http://d-nb.info/*)
    echov Publisher: DNB
    pdf="$url" ;;
  *)
    echov Unknown URL "$url"
    echov If the script is unable to download the full text, please email me so that I can add support for this type of URL. ;;
  esac
  if [[ -v pdf ]]; then
    echon Added publisher URL "$pdf"
    urls+=("$pdf")
  fi
  url="$1"
  # For DOIs, also search arXiv for a matching preprint.
  case "$url" in
  http://*doi.org/* | https://*doi.org*)
    doi="${url##*doi.org/}"
    mapfile -t arxivurls < <(fetch http://export.arxiv.org/api/query --data-urlencode "search_query=doi:\"$doi\"" | xidel - -s -e "//feed/entry/link[@title='pdf']/@href" | sed '/^$/d')
    if [[ -v arxivurls ]]; then
      echon Added arXiv URLs obtained by DOI:
      echon "${arxivurls[@]}"
      urls+=("${arxivurls[@]}")
    fi ;;
  # NOTE(review): the ';;' above and this 'esac' were missing from the
  # extracted text (lost lines); restored at the most plausible point.
  esac
  case "$crurl" in
  http://link.springer.com/book/*)
    echon 'Publisher: Springer (book)'
    echon Springer books are typically split into many individual files, which does not fit the operational model of this script. Aborting.
    return ;;
  esac
  # Last resort: harvest every pdf-looking href/src from the abstract page,
  # filtering out known-irrelevant PDFs (flyers, front matter, forms, ...).
  echon Attempting to extract raw URLs from the abstract page "$crurl":
  mapfile -t newurls < <(echo "$data" | xidel - -s -e "(//@href, //@src)/resolve-uri(.,\"$crurl\")" | sed 's/#.*//' | grep pdf | grep -v "degruyter.com/flyer/\|degruyter.com/.*.toc.xml\|degruyter.com/.*.fm.xml\|ams.org/publications/\|ams.org/firefox\|endmatter\|msp.org/forms\|math.ca/Membership\|math.ca/Docs\|math.ca/.*/abstract/\|pdf-preview\|/marketing/\|\.gif$")
  for i in "${!newurls[@]}"; do
    if [[ ! ${xtried["${newurls["$i"]}"]+_} ]]; then
      vnewurls+=("${newurls[$i]}")
      xtried["${newurls["$i"]}"]=1
    fi
  done
  if [[ -v vnewurls ]]; then
    echon Last resort: some more PDF urls from the abstract page:
    echon "${vnewurls[@]}"
    urls+=("${vnewurls[@]}")
    echon Warning: some publishers link irrelevant PDFs from the abstract page, e.g., license agreements, abstracts, etc.
    echon If the script ends up downloading such an irrelevant PDF, please email me so that I can add it to the list of exceptions.
  fi
}
arXiv() {
  # Look up an arXiv ID ($1) via the export API, extract id/title/authors,
  # and queue the corresponding PDF URL.
  echon Database: arXiv "$1"
  # Flatten the Atom response, then put each tag on its own line.
  data="$(fetch http://export.arxiv.org/api/query?id_list="$1" | tr \\n ' ' | sed 's@<[^/][^>]*/>@\n&\n@g;s@<[^/][^>]*[^/>]>@\n&@g;s@</[^>]*>@&\n@g')"
  echov Processed output:
  echov "$data"
  arxiverr="$(echo "$data" | sed -n '\@^<id>http://arxiv.org/api/errors.*</id>$@{p;q1}')"
  if [[ ! -z "$arxiverr" ]]; then echon "$arxiverr"; fi
  id="$(echo "$data" | sed -n 's@^<id>http://arxiv.org/abs/\(.*\)</id>$@\1@p')"
  echov arXiv ID: "$id"
  title="$(echo "$data" | sed -n 's@^<title>\(.*\)</title>$@\1@p')"
  # Authors: keep the last word of each <name> element.
  authors="$(echo "$data" | sed -n 's@^<name>\(.*\)</name>$@\1@p' | sed 's/.* \([^ ]*\)/\1/')"
  urls+=("http://arxiv.org/pdf/$id.pdf")
}
msn() {
# Look up an article on MathSciNet by MR number ($1): fetch the EndNote
# record and extract title, authors and full-text URL.
# NOTE(review): this function appears truncated by the text extraction --
# there is no closing brace before zbl() below, several commands are
# hard-wrapped mid-string, and the trailing logic after the HTML fallback
# is missing; restore this function from the upstream script before use.
echon Database: MathSciNet "$1"
data="$(fetch http://www.ams.org/mathscinet/search/publications.html?
fmt=endnote\&pg1=MR\&s1=MR"$1" | sed -n '1,/<pre>/d;/<\/pre>/,$d;p')"
echov EndNote:
echov "$data"
#data="$(echo "$data" | tr \\n \\t | sed 's/\t //g' | tr \\t \\n)"
# EndNote fields: %T = title (TeX-normalized), %A = authors (surname kept),
# %U = full-text URL.
title="$(echo "$data" | tr \\n @ | sed -n 's/.*@%T \([^%]*\)@ *%.*/\1/p' | tr @ \
| texnorm)"
echov Title: "$title"
authors="$(echo "$data" | sed -n 's/^%A //p' | sed 's/\([^,]*\),.*/\1/' |
texnorm)"
echov Authors: "$authors"
local url
url="$(echo "$data" | sed -n 's/^%U //p')"
if [ -z "$url" ]; then
# Fallback: scrape the mathscinet-getitem HTML page instead.
echov No URL found in EndNote data, attempting to extract a URL from the HTML
file
hdata="$(fetch http://www.ams.org/mathscinet-getitem?mr=MR"$1" | { grep -A 31
"<strong>MR0*$1</strong>" || true; })"
echov Processed output:
echov "$hdata"
#authors="$(echo "$hdata" | sed 's/<a href/\n<a href/g;s@</a>@</a>\n@g' | sed
-n 's@<a href="/mathscinet/search/author.html?mrauthid=[^"]*">\([^<]*\)</a>@\1@p' |
sed 's/\([^,]*\),.*/\1/')"
#title="$(echo "$hdata" | tr \\n \ | sed -n 's@.*<span class="title">\
([^<]*\)</span>.*@\1@p')"
zbl() {
  # Look up an article on zbMATH by Zbl number ($1): scrape the full-text
  # button URL, pull title/authors from the BibTeX record, then feed each
  # abstract-page URL to fturl.
  echon Database: zbMATH "$1"
  data="$(fetchz https://zbmath.org/?q=an:"$1")"
  #authors="$(echo "$data" | sed 's/<a href/\n<a href/g;s@</a>@</a>\n@g' | sed -n 's@<a href="authors/?q=ai:[^"]*" title="Author Profile">\([^<]*\)</a>@\1@p' | sed 's/\([^,]*\),.*/\1/')"
  #title="$(echo "$data" | sed -n 's@.*<div class="title">\([^<]*\)<.*@\1@p')"
  url="$(echo "$data" | sed -n 's@.*<a class="btn btn-default btn-xs" type="button" href="\([^"]*\)".*@\1@p')"
  #bibtexurl="$(echo "$data" | sed -n 's@.*<a class="btn btn-mini bib" data-container="body" type="button" href="\([^"]*\)".*@\1@p')"
  bibtexurl="https://zbmath.org/bibtex/$1.bib"
  echov zbMATH BibTeX URL: "$bibtexurl"
  data="$(fetchz "$bibtexurl")"
  echov zbMATH BibTeX:
  echov "$data"
  # Authors: take the top-level { } groups of the Author field, strip the
  # surrounding braces, and TeX-normalize.
  authors="$(echo "$data" | sed -n 's@^ Author = {\(.*\)},$@ \1 @p' | grep -Eo ' {([^{}]|({[^}]*}))*} ' | sed 's/^..//;s/..$//' | texsimp)"
  echov BibTeX authors: "$authors"
  title="$(echo "$data" | sed -n 's@^ Title = {{\(.*\)}},$@\1@p' | texnorm)"
  echov BibTeX title: "$title"
  if [ -z "$url" ]; then
    echon 'No full text URL supplied by zbMATH. Try MathSciNet, sometimes it gives a full text URL when zbMATH does not.'
    return
  fi
  # zbMATH may list several URLs, one per line.
  while read -r iurl; do
    echon Trying zbMATH abstract page URL "$iurl"
    fturl "$iurl"
  done <<< "$url"
}
urldecode() {
# Percent-decode stdin: rewrite every '%' as printf's '\x' escape and let
# printf %b expand the resulting \xHH byte sequences.
# Note: assigns the global variable 'a'.
a="$(cat)"
printf '%b' "${a//%/\\x}"
}
doi() {
  # Look up a DOI ($1) on CrossRef for title/authors, then hand the
  # canonical doi.org URL to fturl.
  echon Database: CrossRef "$1"
  data="$(fetch "http://search.crossref.org/dois?q=$1")"
  echov CrossRef data: "$data"
  title="$(echo "$data" | sed -n 's@ "title": "\(.*\)",@\1@p')"
  echov CrossRef title: "$title"
  # Pull the rft.au (author) fields out of the COinS query string.
  authors="$(echo "$data" | sed 's@&@\n@g;s@",$@@' | sed -n 's/rft.au=+//p;s/rft.au=//p' | tr + ' ' | urldecode)"
  echov CrossRef authors field: "$authors"
  if echo "$authors" | grep '&'; then
    authors="$(echo "$authors" | sed 's/&/\n/g' | sed 's/^ *//;s/ *$//')"
  else
    # Keep the last word of each author name.
    authors="$(echo "$authors" | sed 's/.* \([^ ]*\)/\1/')"
  fi
  echov CrossRef authors: "$authors"
  fturl "https://doi.org/$1"
}
cs() {
  # Look up a CiteSeerX document ($1) via its OAI interface for
  # title/authors, then queue the direct download URL.
  echon Database: CiteSeerX "$1"
  data="$(fetch "http://citeseerx.ist.psu.edu/oai2?verb=GetRecord&metadataPrefix=oai_dc&identifier=oai:CiteSeerX.psu:$1")"
  title="$(echo "$data" | sed -n 's@.*<dc:title>\(.*\)</dc:title>.*@\1@p')"
  # Authors: keep the last word of each <dc:creator> element.
  authors="$(echo "$data" | sed -n 's@.*<dc:creator>\(.*\)</dc:creator>.*@\1@gp' | sed 's/.* \([^ ]*\)/\1/')"
  urls+=("http://citeseerx.ist.psu.edu/viewdoc/download?doi=$1&rep=rep1&type=pdf")
}
pe() {
  # Look up a Project Euclid handle ($1) via its RIS citation export for
  # title/authors, then queue the direct PDF URL.
  echon Database: Project Euclid "$1"
  data="$(fetch http://projecteuclid.org/export_citations --data format=ris --data-urlencode "h=$1")"
  echov Project Euclid bibliographic data:
  echov "$data"
  title="$(echo "$data" | sed -n 's/TI - //p')"
  # RIS AU lines give "Last, First"; keep the first word (surname).
  authors="$(echo "$data" | sed -n 's/AU - //p' | sed 's/\([^ ]*\).*/\1/')"
  urls+=("http://projecteuclid.org/download/pdf_1/$1")
}
eudml() {
# Look up a record in EuDML by URN ($1, e.g. eudml:doc:12345): extract
# title/authors from its OAI Dublin Core record, then hand the abstract
# page to fturl.
echon Database: EuDML "$1"
data="$(fetch "https://eudml.org/api/rest/urn:$1?format=oai_dc")"
echov EuDML data:
echov "$data"
# First <dc:title> wins.
title="$(echo "$data" | sed -n 's|.*>\([^>]*\)</dc:title>.*|\1|p' | head -1)"
authors="$(echo "$data" | sed -n 's|.*>\([^>]*\)</dc:creator>.*|\1|p')"
# "Last, First" style -> keep text before the comma; else keep last word.
if echo "$authors" | grep -c , >/dev/null; then
authors="$(echo "$authors" | sed 's/\([^ ,]*\).*/\1/')"
else
authors="$(echo "$authors" | sed 's/.* \([^ ]*\)/\1/')"
fi
fturl "https://eudml.org/doc/${1##eudml:doc:}"
}
gen() {
  # Look up a book on Library Genesis by MD5 ($1): scrape ISBN/title/
  # authors, refine metadata via WorldCat and Google Books, then queue
  # several mirror download URLs. Sets the globals $extn and $id used by
  # retft for octet-stream downloads.
  echon Database: Library Genesis "$1"
  data="$(fetch "http://libgen.io/book/index.php?md5=$1")"
  # Word-splitting into an array is intentional here.
  isbn=($(echo "$data" | sed -n 's@.*ISBN:</font></td><td>\([^<]*\)</td>.*@\1@p' | tr -cs 0-9- ' '))
  if [[ ${#isbn[@]} -ne 0 ]]; then
    echov ISBNs from Library Genesis: "${isbn[@]}"
  fi
  title="$(echo "$data" | sed -n 's@.*Title: </font></nobr></td><td colspan=2><b><a href=[^>]*>\([^<]*\)</a>.*@\1@p')"
  authors="$(echo "$data" | sed -n 's@.*Author(s):</font></nobr></td><td colspan=3><b>\([^<]*\)</b>.*@\1@p' | sed 's/(auth.)//g' | sed 's/, /\n/g' | sed 's/^ *//;s/ *$//' | sed -n 's/.* \([^ ]*\)/\1/p')"
  # First pass: WorldCat xISBN metadata.
  for i in "${!isbn[@]}"; do
    echov Trying ISBN "${isbn[$i]}" with WorldCat
    wdata="$(fetch "http://xisbn.worldcat.org/webservices/xid/isbn/${isbn[$i]}?method=getMetadata&format=json&fl=*")"
    echov ISBN bibliographic data from WorldCat: "$wdata"
    if [[ "ok" != "$(echo "$wdata" | jq -r .stat)" ]]; then
      continue
    fi
    if ! authors="$(echo "$wdata" | jq -e -r '.list[0].author' | sed 's/\.$//;s/ ; /\n/g;s/ and /\n/g;s/, /\n/g' | sed 's/.* \([^ ]*\)/\1/')"; then
      # No author field: fall back to the WorldCat EndNote record.
      oclc="$(echo "$wdata" | jq -e -r '.list[0].oclcnum[0]')"
      echov OCLC number: "$oclc"
      wwdata="$(fetchr "http://www.worldcat.org/oclc/$oclc?page=endnote&client=worldcat.org-detailed_record")"
      echov EndNote bibliographic data from WorldCat: "$wwdata"
      authors="$(echo "$wwdata" | sed -n 's/^AU - //p' | sed 's/\(.*\),.*/\1/')"
    fi
    echov Authors from WorldCat: "$authors"
    title="$(echo "$wdata" | jq -r .list[0].title)"
    echov Title from WorldCat: "$title"
    if [[ -n "$authors" && -n "$title" ]]; then
      break
    fi
  done
  # Second pass: Google Books, if WorldCat came up empty.
  if [[ -z "$authors" || -z "$title" ]]; then
    for i in "${!isbn[@]}"; do
      echov Trying ISBN "${isbn[$i]}" with Google Books
      bdata="$(fetch "https://www.googleapis.com/books/v1/volumes?q=isbn+${isbn[$i]}&fields=items/volumeInfo(title,authors)&maxResults=1")"
      echov ISBN bibliographic data from Google Books: "$bdata"
      if [[ "$bdata" = "{}" ]]; then
        continue
      fi
      authors="$(echo "$bdata" | jq -r .items[0].volumeInfo.authors[] | sed 's/.* \([^ ]*\)/\1/')"
      echov Authors from Google Books: "$authors"
      title="$(echo "$bdata" | jq -r .items[0].volumeInfo.title)"
      echov Title from Google Books: "$title"
      if [[ -n "$authors" && -n "$title" ]]; then
        break
      fi
    done
  fi
  extn="$(echo "$data" | sed -n 's@.*Extension:</font></nobr></td><td>\([^<]*\)</td>.*@\1@p')"
  id="$(echo "$data" | sed -n 's@.*ID:</font></nobr></td><td>\([^<]*\)</td>.*@\1@p')"
  echov Library Genesis extension: "$extn"
  echov Library Genesis ID: "$id"
  # Mirror URLs: ${id::-3}000 is the thousand-bucket, ${1,,} the lowercase MD5.
  urls+=("http://libgen.me/noleech1.php?hidden=${id::-3}000//${1,,}") # ${id::-3}000/
  urls+=("http://dl.b-ok.org/genesis/${id::-3}000/${1,,}/_as/") # ${id::-3}000/
  urls+=("http://dlx.b-ok.org/genesis/${id::-3}000/${1,,}/_as/")
  urls+=("http://libgen.io/get/$1/name.pdf")
  urls+=("http://userbooks.bookfi.org/2/$1")
}
# Dispatch on the article identifier ($artid, parsed from the command line
# elsewhere in the script): recognized URLs and prefixed IDs go to the
# matching database handler; unknown HTTP URLs get generic extraction.
case "$artid" in
http*://*arxiv.org/abs/*)
  echov arXiv URL "$artid"
  arXiv "${artid##*abs/}" ;;
http*://*arxiv.org/pdf/*)
  echov arXiv URL "$artid"
  trimurl="${artid##*pdf/}"
  arXiv "${trimurl%.pdf}" ;;
http://front.math.ucdavis.edu/*)
  echov Front for the arXiv URL "$artid"
  arXiv "${artid##*ucdavis.edu/}" ;;
http://*ams.org/mathscinet-getitem?mr=*)
  echov MathSciNet getitem URL "$artid"
  msn "${artid##*mr=}" ;;
http://*ams.org/mathscinet/search/publdoc.html*mx-pid=*)
  echov MathSciNet search URL "$artid"
  msnid="${artid##*mx-pid=}"
  msn "${msnid%%&*}" ;;
http://*ams.org/mathscinet/search/*)
  echov MathSciNet generic search URL "$artid"
  data="$(fetch "$artid")"
  msnid="$(echo "$data" | grep mathscinet-getitem | sed 's/.*mathscinet-getitem?mr=\([^"]*\)".*/\1/')"
  msn "$msnid" ;;
https://*zbmath.org/?q=an:*)
  echov zbMATH URL "$artid"
  zbl "${artid##*q=an:}" ;;
https://zbmath.org/*)
  echov zbMATH URL "$artid"
  zbl "${artid##*zbmath.org/}" ;;
http://*doi.org/* | https://*doi.org/*)
  echov DOI URL "$artid"
  doi "${artid##*doi.org/}" ;;
http://*gen*md5=*)
  echov Library Genesis URL "$artid"
  genid="${artid##*md5=}"
  gen "${genid%%&*}" ;;
http*://*citeseerx*/*doi=*)
  echov CiteSeerX URL "$artid"
  csid="${artid##*doi=}"
  cs "${csid%%&*}" ;;
http://projecteuclid.org/euclid.*)
  echov Project Euclid URL "$artid"
  pe "${artid##http://projecteuclid.org/}" ;;
http*://*)
  # Unrecognized site: fall back to generic scraping (auti mode).
  echon Unknown HTTP URL: "$artid"
  echon Attempting generic full-text URL extraction
  title=unknown-title
  authors=unknown-authors
  auti=1
  fturl "$artid" ;;
arXiv:*)
  arXiv "${artid:6}" ;;
MR*)
  msn "${artid:2}" ;;
Zbl:*)
  zbl "${artid:4}" ;;
doi:*)
  doi "${artid:4}" ;;
gen:*)
  gen "${artid:4}" ;;
cs:*)
  cs "${artid:3}" ;;
pe:*)
  pe "${artid:3}" ;;
eudml:*)
  eudml "$artid" ;;
*)
  fatal "Unrecognized article ID: $artid" ;;
esac
stripp() {
  # Normalize a title/author string from stdin into a file-name fragment:
  # first strip [bracketed] text and TeX control words and turn / into -.
  sed 's/\[[^]]*\]//g;s/\\[a-zA-Z]*//g;s|/|-|g' | if [[ -v fancy ]]; then
    # Fancy mode (-f): keep capitalization and spaces, join lines with
    # ", " and terminate with ". ".
    sed 's/[[:space:]]\+/ /g;s/^ //;s/ $//' | tr \\n \\f | sed 's/\.$//g' | sed 's/\f$/. /;s/\f/, /g'
  else
    # Default: lowercase, protect apostrophes via \f, collapse punctuation
    # and whitespace runs into single dashes.
    sed 's/.*/\L&/' | sed 's/'"'"'/\f/g;s/[[:punct:]]/ /g;s/\f/'"'"'/g;s/'"''"'//g;s/[[:space:]]\+/-/g;s/^-//;s/-$//' | tr \\n -
  fi
}
tryft() {
# Try one full-text URL ($1): download into a temp file via retft; on
# success rename it to $name plus the extension retft chose ($dextn),
# optionally run the -e hook command, and exit the whole script.
tname="$(mktemp /tmp/article-XXX)"
echon Temporary file name: "$tname"
if retft "$1" "$tname"; then
# Guard against zero-byte downloads.
if [ -s "$tname" ]; then
echon Successfully downloaded "$1"
fqname="$name.$dextn"
echon Moving "$tname" to "$fqname"
mv "$tname" "$fqname"
# -e hook: run the user command on the downloaded file.
if [[ -v cmd ]]; then
echon Launching "$cmd" "${args[@]:+${args[@]}}" "$fqname"
"$cmd" "${args[@]:+${args[@]}}" "$fqname"
fi
# A successful download ends the script; remaining URLs are not tried.
exit 0
else
echon Downloaded an empty file, skipping.
fi
fi
}