#!/bin/bash
# article: a bash script to download a paper indexed by arXiv, Mathematical Reviews,
# Zentralblatt MATH, CrossRef, CiteSeerX, Project Euclid
# and save it under a file name like smith-brown-an-example-of-a-model-category.pdf.
# See the source code for the list of supported HTTP URLs.
# Copyright 2014, 2015 Dmitri Pavlov. Distributed under the terms of the
# GNU Affero General Public License version 3.
version=2017-08-18

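# Abort on unset variables, failed commands, and failures inside pipelines.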
set -u -e -o pipefail
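
# Example invocations (illustrative article IDs, not real articles):
#   article -d ~/papers arXiv:1501.00001
#   article -n MR1234567
#   article doi:10.1000/182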

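# Diagnostics go to stderr so that stdout stays clean for -n/-a output.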
echoerr() {
    >&2 echo "$@"
}

fatal() {
    echoerr "$@"
    exit 1
}

echon() {
    if [[ ! -v quiet ]]; then echoerr "$@"; fi
}

echov() {
    if [[ -v verbose ]]; then echoerr "$@"; fi
}

syn() {
    fatal "Synopsis: $0" '[ options ] [ MR[0-9]* | Zbl:[0-9]*.[0-9]* | arXiv:[0-9]*.[0-9]* | arXiv:[-a-z]*/[0-9]* | doi:.* | cs:[0-9.]* | http://.* | https://.* ]' "
-d directory: directory where to save the file
-q: quiet, do not output any diagnostics
-v: verbose, print additional diagnostics
-i: offer an interactive choice of a full-text URL to download
-u url: use url as a full text URL
-f: fancy file names
-n: dry run: print the final file name and the full text URLs, but do not download anything
-a: extract and print abstract page URLs, do not download anything
-e command arguments --: execute a command after a successful download
-p pairing: use an AMS pairing key to access MathSciNet"
}

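# Candidate full-text URLs, accumulated by the database and publisher handlers below.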
urls=()

while (( $# > 0 )); do
    case "$1" in
        -d)
            shift
            if (( $# == 0 )); then
                fatal "Option -d requires an argument"
            fi
            if [[ -d "$1" ]]; then
                dirname="$1"
            else
                fatal "No such directory: $1"
            fi ;;
        -q)
            quiet=1 ;;
        -v)
            set -v
            verbose=1 ;;
        -i)
            interactive=1 ;;
        -u)
            shift
            if (( $# == 0 )); then
                fatal "Option -u requires an argument"
            fi
            urls+=("$1") ;;
        -f)
            fancy=1 ;;
        -n)
            dryrun=1 ;;
        -a)
            abstract=1 ;;
        -e)
            shift
            if (( $# == 0 )); then
                fatal "Option -e requires arguments"
            fi
            cmd="$1"
            args=()
            shift
            while (( $# > 0 )); do
                if [[ "$1" == "--" ]]; then
                    break
                fi
                args+=("$1")
                shift
            done
            if (( $# == 0 )); then
                fatal "Unterminated -e option"
            fi ;;
        -p)
            shift
            if (( $# == 0 )); then
                fatal "Option -p requires an argument"
            fi
            amspairing="$1" ;;
        *)
            if (( $# == 1 )); then
                artid="$1"
            else
                fatal "Unrecognized option $1"
            fi ;;
    esac
    shift
done
if [[ ! -v artid ]]; then
    echoerr "No article id specified"
    syn
fi

echon "Scientific article full text downloader by Dmitri Pavlov, version $version."
echon "To report bugs and missing features, please email me (host math.berkeley.edu, user pavlov).
Please include the command line and the output of the script when run with the -v option in your email.
Before submitting a bug report, please make sure that you can download the full text using your browser;
the inability of the script to download the full text is often an indication that you don't have a subscription.
"

echov "Supported databases: MathSciNet, zbMATH, arXiv, DOI, Library Genesis.
Email me if you want the script to support other databases.
Incomplete list of supported repositories: ScienceDirect, SpringerLink,
Taylor and Francis, Walter de Gruyter, World Scientific, SIAM, AMS, OUP,
CUP, CMS, MSP, MUSE, TAC, JSTOR, Project Euclid, NUMDAM, CEDRAM, EuDML.
Many additional repositories are supported by virtue of generic methods.
Email me if you want the script to support other repositories.
"

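# TeX cleanup helpers; they rely on the external tools texuni and uconv.
# texsimp converts TeX markup to NFC-normalized Unicode; texstrip removes
# remaining control sequences and math characters; texnorm does both.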
texsimp() {
    texuni | uconv -x any-nfc
}

texstrip() {
    sed 's/\\[a-zA-Z]*//g;s/[$^_{}]//g'
}

texnorm() {
    texsimp | texstrip
}

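# curl wrappers: fetch adds the AMS pairing cookie for *.ams.org URLs,
# fetchc uses a fresh cookie engine, fetchr/fetchcr additionally follow
# redirects, sfetch prints only the redirect target of a HEAD request, and
# fetchz handles zbMATH CAPTCHA challenges (it relies on an external "show"
# command, assumed available, to display the CAPTCHA image).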
fetch() {
    echov Fetching "$@"
    cline=(curl -s -S -f)
    if [[ -v amspairing ]]; then
        case "${@:$#}" in
            *.ams.org/*)
                cline+=("-b" "amspairing=$amspairing") ;;
        esac
    fi
    "${cline[@]}" "$@"
}

fetchc() {
    fetch -b /dev/null "$@"
}

fetchr() {
    fetch -L "$@"
}

fetchcr() {
    fetchc -L "$@"
}

sfetch() {
    fetch -o /dev/null -I -w "%{redirect_url}\n" "$@"
}

fetchz() {
    data="$(fetch "$@")"
    while [[ $data == *captcha* ]]; do
        echon zbMATH demands a CAPTCHA, which means that no subscription is available. Manual entry.
        id="$(printf '%s\n' "$data" | grep captcha_id | sed 's/.*value="\([^"]*\)".*/\1/')"
        show "https://zbmath.org/captcha/$id"
        echo Enter zbMATH CAPTCHA:
        read -r captcha
        echov Entered CAPTCHA: "$captcha"
        data="$(fetch -F captcha_id="$id" -F captcha_solution="$captcha" "$@")"
    done
    printf '%s\n' "$data"
}

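# retft URL FILE: download URL into FILE and classify the result; on success
# it sets dextn to the file extension deduced from the Content-Type header.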
retft() {
    local pdf="$1"
    echon Attempting to retrieve the full text using URL "$pdf"
    local tname="$2"

    local jar="/dev/null"
    local -a addopts=("-L")
    case "$pdf" in
        http://www.jstor.org/*)
            jar="$(mktemp /tmp/article-XXX)"
            echov Special cookie treatment for JSTOR
            echov Cookie jar file: "$jar"
            fetchcr -c "$jar" -I "$pdf"
            pdf="$pdf"'?acceptTC=true' ;;
        http*://projecteuclid.org/*)
            echov Publisher: Project Euclid
            echov Adding referrer
            addopts+=("-e" "https://projecteuclid.org") ;;
        http*://*cms.math.ca/*)
            echov Publisher: CMS
            echov Adding referrer
            addopts+=("-e" "https://cms.math.ca/") ;;
        http://libgen.*)
            echov Repository: Library Genesis
            echov Adding referrer
            addopts+=("-e" "http://libgen.io/scimag/ads.php") ;;
        http://booksc.*)
            echov Repository: Library Genesis BookSC
            echov Adding referrer
            addopts+=("-e" "http://booksc.org") ;;
        http://*sciencedirect*)
            echov Repository: Elsevier
            echov Adding user agent
            addopts+=("-A" "Mozilla") ;;
        http*://*ams.org/*)
            echov Publisher: AMS
            if [[ -v amspairing ]]; then
                echov Adding pairing key
                jar="amspairing=$amspairing"
            else
                echov No pairing key specified
            fi ;;
    esac

    cline=(curl -b "$jar" "${addopts[@]}" "$pdf" -w '%{content_type}\n%{http_code}\n' -o "$tname")
    echon Command line for full text download: "${cline[@]}"
    result="$("${cline[@]}")"
    #result="$(curl -b "$jar" -e ";auto" -L "$pdf" -w '%{content_type}\n%{http_code}\n' -o "$tname")"
    type="$(echo "$result" | head -1)"
    echov Content-Type: "$type"
    code="$(echo "$result" | tail -1)"
    echov HTTP code: "$code"

    case "$code" in
        200) ;;
        401)
            echon "HTTP code 401 (Unauthorized) most likely means that you have no subscription to this resource."
            echon "Check whether you have a subscription; if you can successfully download the full text file, please email me."
            return 1 ;;
        404)
            echon "HTTP code 404 (Not Found) usually means that there is a bug in the script, unless downloading from one of Library Genesis mirrors."
            echon "Please email me the command line and the output of the script so that I can fix the bug."
            return 1 ;;
        *)
            echon "Error: HTTP code is $code, not 200, downloaded file ignored."
            echon "Check whether you have a subscription; if you can successfully download the full text file, please email me."
            return 1 ;;
    esac

    case "$type" in
        application/pdf) echon PDF file; dextn="pdf" ;;
        application/pdf*) echon PDF file from JSTOR; dextn="pdf" ;; # JSTOR server is run by incompetent people
        text/pdf) echon PDF file from CUP; dextn="pdf" ;; # CUP server is also run by incompetent people
        application/postscript) echon PostScript; dextn="ps" ;;
        image/vnd.djvu) echon DjVu; dextn="djvu" ;;
        application/x-dvi) echon DVI; dextn="dvi" ;;
        application/x-tar) echon DVI in a TAR file; dextn="tar" ;;
        application/octet-stream|application/download*)
            if [[ -v extn ]]; then
                echon File from Library Genesis, extension "$extn"
                dextn="$extn"
            else
                echon "No extension supplied for application/octet-stream. Report this error to me by email."
                return 1
            fi ;;
        *)
            echon "Unrecognized Content-Type: not PDF, PostScript, DjVu, or DVI, downloaded file ignored."
            echon "This might mean that you have no subscription to this content, because many scientific repositories use a brain-damaged way to report authorization errors."
            echon "If you do have a subscription and can successfully download and view the full text file, please email me."
            return 1 ;;
    esac
    echon Extension: "$dextn"
}

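# URLs already harvested from abstract pages, to avoid adding duplicates.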
declare -A xtried

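# fturl URL: given an abstract page URL, try publisher-specific rules, the
# citation_pdf_url meta tag, DOI-based lookups (arXiv, BookSC, Library
# Genesis), and finally raw PDF links scraped from the page.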
fturl() {
    if [[ -v abstract ]]; then
        echo "$1"
        return
    fi
    local url="$1"
    echon Abstract page URL: "$url"

    set +e # allow curl to fail so that doi links still get processed
    meta="$(fetchcr -I -A / -w "%{content_type}\n%{url_effective}\n" "$url")"
    set -e
    ctype="$(echo "$meta" | tail -2 | head -1)"
    echon Content-Type: "$ctype"
    crurl="$(echo "$meta" | tail -1)"
    echon Completely resolved URL: "$crurl"
    if [[ "$ctype" == "application/pdf" ]]; then
        urls+=("$crurl")
        echov "Content-Type is application/pdf; added direct PDF URL $crurl"
        data=""
    else
        set +e
        data="$(fetchcr -A / "$url")"
        set -e
    fi
    if [[ -v auti ]]; then
        if echo "$data" | grep -Ec 'doi.org/|"doi"' >/dev/null; then
            dois="$(echo "$data" | sed -n 's@.*doi.org/\([^ "<]*\).*@\1@p;s/"doi":"\([^"]*\)"/\1/p')"
            echov Extracted DOIs:
            echov "$dois"
            doi "$(echo "$dois" | head -1)"
            return
        fi
    fi
    if echo "$data" | grep -c citation_pdf_url >/dev/null; then
        echon 'Generic method (citation_pdf_url), e.g., AMS, EMS, Project Euclid, CUP, OUP, Springer, de Gruyter, Wiley'
        local pdf
        pdf="$(echo "$data" | tr \\n \  | sed -n 's@.*\(<[^>]*citation_pdf_url[^>]*>\).*@\1@p' | sed -n 's@.*content=[^"]*"\([^"]*\)".*@\1@p')"
        echon citation_pdf_url: "$pdf"
        case "$pdf" in
            http://onlinelibrary.wiley.com/doi/*)
                echov Publisher: Wiley
                pdf="$(fetch "$pdf" | sed -n 's@.*id="pdfDocument" src="\([^"]*\)".*@\1@p' | sed 's/&amp;/\&/g')"
                if [[ -z "$pdf" ]]; then
                    echon 'No Wiley PDF URL found; possible cause: no subscription'
                    unset pdf
                else
                    echov Adjusted Wiley PDF URL: "$pdf"
                fi ;;
            http://journals.cambridge.org/*)
                echov Publisher: CUP
                pdf="$(sfetch "$pdf")""&toPdf=true"
                echov Adjusted CUP PDF URL: "$pdf" ;;
            http://*)
                echov Generic HTTP URL ;;
            https://*)
                echov Generic HTTPS URL ;;
            *)
                echov Generic relative URL
                urlbase="$(sfetch "$url" | sed -n 's@^\(http.*//[^/]*\).*@\1@p')"
                echov Base "$urlbase"
                pdf="$urlbase$pdf"
                echov Adjusted URL: "$pdf" ;;
        esac
        case "$data" in
            *"Duke Mathematical Journal"*)
                echov Switching to the nonenhanced PDF for the Duke Mathematical Journal due to an insane color scheme
                pdf="${pdf//pdfview/pdf}" ;;
        esac
        if [[ -v pdf ]]; then
            urls+=("$pdf")
            echon Added citation_pdf_url "$pdf"
        fi
    fi
    if [[ -v auti ]]; then
        echov Attempting to extract title and authors from the HTML
        data="$(echo "$data" | iconv -f "$(echo "$data" | file -b --mime-encoding -)" -t utf-8 -c)"
        if echo "$data" | grep -c citation_title >/dev/null; then
            title="$(echo "$data" | tr \\n \  | sed -n 's@.*\(<[^>]*citation_title[^>]*>\).*@\1@p' | tee | sed -n 's@.*content=[^"]*"\([^"]*\)".*@\1@p')"
            echov citation_title: "$title"
        fi
        if echo "$data" | grep -c citation_author >/dev/null; then
            authors="$(echo "$data" | sed -n 's@.*\(<[^>]*citation_author[^>]*>\).*@\1@p' | sed -n 's@.*content=[^"]*"\([^"]*\)".*@\1@p')"
            echov citation_author: "$authors"
            if echo "$authors" | grep ,; then
                authors="$(echo "$authors" | sed 's/,.*//')"
                echov Author last names before commas: "$authors"
            else
                authors="$(echo "$authors" | sed 's/.* \([^ ]*\)/\1/')"
                echov Author last names: "$authors"
            fi
        fi
    fi
    local pdf
    case "$url" in
        http://*doi.org/* | https://*doi.org/*)
            doi="${url##*doi.org/}"
            echon DOI: "$doi"
            echov DOI URL: "$url"
            rurl="$(sfetch "$url")"
            echon Resolved DOI: "$rurl" ;;
        *)
            rurl="$url" ;;
    esac
    url="$rurl"

    case "$rurl" in
        http://mr.crossref.org/*)
            echon CrossRef fork
            links="$(fetch "$rurl" | { grep -o "href=['\"][^'\"]*['\"]" || true; } | sed 's/href=.//;s/.$//' | { grep -Ev '^https?://.*doi.org' || true; } | uniq)"
            echon Detected links:
            echon "$links"
            for i in $links; do
                echon Recursively processing link "$i"
                fturl "$i"
            done ;;
        http://linkinghub.elsevier.com/retrieve/pii/*)
            echov Publisher: Elsevier
            eid="${rurl:44}"
            echov Elsevier ID: "$eid"
            #pdf="$(echo "$data" | sed -n 's/.*pdfurl="\([^"]*\)".*/\1/p')"
            eapdata="$(fetch -A / "http://www.sciencedirect.com/science/article/pii/$eid")"
            #pdf="$(echo "$eapdata" | sed -n 's/.*pdfurl="\([^"]*\)".*/\1/p')"
            pdf="$(echo "$eapdata" | sed -n 's@.*<a class="pdf-download-btn-link" href="\([^"]*\)".*@https://www.sciencedirect.com\1@p' | sed 's/&amp;/\&/g')"
            if [[ -z "$pdf" ]]; then
                echon 'No Elsevier PDF URL found.'
                pdf="$(echo "$eapdata" | tr \\n @ | sed -n 's/.*pdf-download-link"@ *href="\([^"]*\)".*/\1/p')"
                if [[ -z "$pdf" ]]; then
                    echon 'No Elsevier pdf-download-link found.'
                    echon 'Possible cause: no subscription.'
                    echon 'If you do have subscription, please email me.'
                    unset pdf
                fi
            fi
            if [[ -v pdf && $pdf == //* ]]; then
                pdf="http:$pdf"
                echov Adjusting // Elsevier URL to "$pdf"
            fi ;;
        http://www.tandfonline.com/doi/abs/*)
            echov Publisher: Taylor and Francis
            pdf="${rurl//abs/pdf}" ;;
        http://www.worldscientific.com/doi/abs/*)
            echov Publisher: World Scientific
            pdf="${rurl//abs/pdfplus}" ;;
        http://epubs.siam.org/doi/abs/*)
            echov Publisher: SIAM
            pdf="${rurl//abs/pdf}" ;;
        http://www.msp.warwick.ac.uk/*)
            echov Publisher: MSP
            pdf=http://www.msp.warwick.ac.uk"$(echo "$data" | sed -n 's/.*"\([^"]*[sp]\.pdf\)".*/\1/p' | head -1)" ;;
        http://*journalofsing.org/*)
            echov Publisher: Journal of Singularities
            pdf="${rurl%/*}/$(fetch "$rurl" | sed -n 's@.*href="\([^"]*pdf\)" style.*@\1@p')" ;;
        http://www.jstor.org/*)
            echov Publisher: JSTOR
            pdf=http://www.jstor.org/stable/pdfplus/${doi:8}.pdf ;;
        http://www.cms.math.ca/*)
            echov Publisher: CMS
            pdf="http://cms.math.ca$(echo "$data" | sed -n 's@.*<a href="\([^"]*\)">Read article</a>.*@\1@p')" ;;
        http://www.intlpress.com/*)
            echov Publisher: International Press
            pdf="$rurl"$(fetch "$rurl/body.html" | sed -n 's@.*"\([^"]*.pdf\)".*@\1@p') ;;
        http://*.impan.pl/cgi-bin/doi*)
            echov Publisher: IMPAN
            pdf="${rurl//\/doi/\/pdf}"
            if [[ "${pdf: -2:1}" == "-" ]]; then
                pdf="${pdf:0:-2}-0${pdf: -1:1}"
            fi ;;
        http://retro.seals.ch/digbib/view?rid=*)
            echov Publisher: retro.seals
            pdf="${rurl//digbib\/view?rid=/cntmng?pid=}" ;;
        # end of DOI URLs
        http://www.numdam.org/item?id=*)
            echov Publisher: Numdam
            numdam="${url:30}"
            echov Numdam ID: "$numdam"
            pdf="http://archive.numdam.org/article/$numdam.pdf" ;;
        http://*.cedram.org/item?id=*)
            echov Publisher: Cedram
            numdam="${url#*item?id=}"
            echov Numdam ID: "$numdam"
            pdf="http://archive.numdam.org/article/$numdam.pdf" ;;
        http*://eudml.org/*)
            echov Publisher: EuDML
            pdf="$(echo "$data" | sed -n "s@.*<a href='\\([^']*\\)'"' title="" target="_blank" rel="nofollow">Full (PDF)</a>.*@\1@p')"
            if [ -z "$pdf" ]; then
                link="$(echo "$data" | sed -n "s@.*<a href='\\([^']*\\)'"' title="" target="_blank" rel="nofollow">Access to full text</a>.*@\1@p')"
                echov Intermediate link: "$link"
                fturl "$link"
            fi ;;
        http://muse.jhu.edu/*.pdf)
            echov Publisher: MUSE
            pdf="$url" ;;
        http://www.emis.de/*abs.html)
            echov Publisher: EMIS
            pdf="${url//abs.html/.pdf}" ;;
        http://www.emis.de/*.html)
            echov Publisher: EMIS
            pdf="${url//html/pdf}" ;;
        http://www.digizeitschriften.de/dms/*)
            echov Publisher: DigiZeitschriften
            link="${url//resolveppn/img}"
            echov PDF page for DigiZeitschriften: "$link"
            data="$(fetch "$link")"
            pdf="$(echo "$data" | sed -n 's@.*<a href="\([^"]*\)" class="maintitle_pdf">.*@\1@p')" ;;
        http://gdz.sub.uni-goettingen.de/dms/resolveppn/*)
            echov Publisher: GDZ "$url"
            data="$(fetchr "$url")"
            pdf="$(echo "$data" | sed -n 's@.*<a href="\([^"]*\)" class="maintitle_pdf">.*@\1@p')" ;;
        http://*tac.mta.ca*)
            echov Publisher: TAC
            pdf="${url//abs.html/.pdf}" ;;
        http://www.pnas.org/cgi/doi/*)
            echov Publisher: PNAS
            pdf="$crurl" ;;
        http://tcms.org.ge/*)
            echov Publisher: TCMS
            volume="$(echo "$data" | sed -n 's@.*Vol. \([^(]*\)(.*@\1@p')"
            echov Volume: "$volume"
            trim="${url%/abstract.htm}"
            echov Trimmed URL: "$trim"
            stem="${trim##*/}"
            echov URL stem: "$stem"
            pdf="${trim//volumes/xvolumes}/v${volume}${stem}hl.pdf" ;;
        http://*mathematik.uni-bielefeld.de/documenta/*)
            echov Publisher: Documenta Mathematica
            pdf="${url//html/pdf}" ;;
        http://d-nb.info/*)
            echov Publisher: DNB
            pdf="$url" ;;
        *)
            echov Unknown URL "$url"
            echov If the script is unable to download the full text, please email me so that I can add support for this type of URL. ;;
    esac
    if [[ -v pdf ]]; then
        echon Added publisher URL "$pdf"
        urls+=("$pdf")
    fi

    url="$1"
    case "$url" in
        http://*doi.org/* | https://*doi.org*)
            doi="${url##*doi.org/}"
            mapfile -t arxivurls < <(fetch http://export.arxiv.org/api/query --data-urlencode "search_query=doi:\"$doi\"" | xidel - -s -e "//feed/entry/link[@title='pdf']/@href" | sed '/^$/d')
            if [[ -v arxivurls ]]; then
                echon Added arXiv URLs obtained by DOI:
                echon "${arxivurls[@]}"
                urls+=("${arxivurls[@]}")
            fi

            mapfile -t scpdfurls < <(fetch http://booksc.org/s/ --data-urlencode "q=$doi" --data e=1 | sed -n 's@.*href="\(http://booksc.org/dl/[^"]*\)".*@\1@p')
            if [[ -v scpdfurls ]]; then
                urls+=("${scpdfurls[@]}")
                extn="pdf"
                echon Added Library Genesis BookSC DOI URLs "${scpdfurls[@]}" with extension "$extn"
            else
                echon BookSC search unsuccessful.
            fi

            lgpdf="$(fetch -G "http://libgen.io/scimag/ads.php" --data-urlencode "doi=$doi" | grep scimag/get | sed "s/.*href='.*\(http:[^']*\)'.*/\1/g" ||:)"
            if [[ -v lgpdf ]]; then
                urls+=("$lgpdf")
                extn="pdf"
                echon Added Library Genesis DOI URL "$lgpdf" with extension "$extn"
            fi ;;
    esac

    case "$crurl" in
        http://link.springer.com/book/*)
            echon 'Publisher: Springer (book)'
            echon Springer books are typically split into many individual files, which does not fit the operational model of this script. Aborting.
            return ;;
    esac

    echon Attempting to extract raw URLs from the abstract page "$crurl":
    mapfile -t newurls < <(echo "$data" | xidel - -s -e "(//@href, //@src)/resolve-uri(.,\"$crurl\")" | sed 's/#.*//' | grep pdf | grep -v "degruyter.com/flyer/\|degruyter.com/.*.toc.xml\|degruyter.com/.*.fm.xml\|ams.org/publications/\|ams.org/firefox\|endmatter\|msp.org/forms\|math.ca/Membership\|math.ca/Docs\|math.ca/.*/abstract/\|pdf-preview\|/marketing/\|\.gif$")
    for i in "${!newurls[@]}"; do
        if [[ ! ${xtried["${newurls["$i"]}"]+_} ]]; then
            vnewurls+=("${newurls[$i]}")
            xtried["${newurls["$i"]}"]=1
        fi
    done
    if [[ -v vnewurls ]]; then
        echon Last resort: some more PDF urls from the abstract page:
        echon "${vnewurls[@]}"
        urls+=("${vnewurls[@]}")
        echon Warning: some publishers link irrelevant PDFs from the abstract page, e.g., license agreements, abstracts, etc.
        echon If the script ends up downloading such an irrelevant PDF, please email me so that I can add it to the list of exceptions.
    fi
}

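# Database handlers: each one extracts title/authors and pushes candidate
# full-text URLs onto the urls array. arXiv: query the Atom API; the sed
# splits the XML into one tag per line so later seds can match line by line.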
arXiv() {
    echon Database: arXiv "$1"
    data="$(fetch http://export.arxiv.org/api/query?id_list="$1" | tr \\n \  | sed 's@<[^/][^>]*/>@\n&\n@g;s@<[^/][^>]*[^/>]>@\n&@g;s@</[^>]*>@&\n@g')"
    echov Processed output:
    echov "$data"
    arxiverr="$(echo "$data" | sed -n '\@^<id>http://arxiv.org/api/errors.*</id>$@{p;q1}')"
    if [[ ! -z "$arxiverr" ]]; then echon "$arxiverr"; fi
    id="$(echo "$data" | sed -n 's@^<id>http://arxiv.org/abs/\(.*\)</id>$@\1@p')"
    echov arXiv ID: "$id"
    title="$(echo "$data" | sed -n 's@^<title>\(.*\)</title>$@\1@p')"
    authors="$(echo "$data" | sed -n 's@^<name>\(.*\)</name>$@\1@p' | sed 's/.* \([^ ]*\)/\1/')"
    urls+=("http://arxiv.org/pdf/$id.pdf")
}

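# MathSciNet: fetch EndNote-format bibliographic data; fall back to scraping
# the getitem HTML, with special cases for TAC and Documenta Mathematica.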
msn() {
    echon Database: MathSciNet "$1"
    data="$(fetch http://www.ams.org/mathscinet/search/publications.html?fmt=endnote\&pg1=MR\&s1=MR"$1" | sed -n '1,/<pre>/d;/<\/pre>/,$d;p')"
    echov EndNote:
    echov "$data"
    #data="$(echo "$data" | tr \\n \\t | sed 's/\t //g' | tr \\t \\n)"
    title="$(echo "$data" | tr \\n @ | sed -n 's/.*@%T \([^%]*\)@ *%.*/\1/p' | tr @ \  | texnorm)"
    echov Title: "$title"
    authors="$(echo "$data" | sed -n 's/^%A //p' | sed 's/\([^,]*\),.*/\1/' | texnorm)"
    echov Authors: "$authors"

    local url
    url="$(echo "$data" | sed -n 's/^%U //p')"
    if [ -z "$url" ]; then
        echov No URL found in EndNote data, attempting to extract a URL from the HTML file
        hdata="$(fetch http://www.ams.org/mathscinet-getitem?mr=MR"$1" | { grep -A 31 "<strong>MR0*$1</strong>" || true; })"
        echov Processed output:
        echov "$hdata"
        #authors="$(echo "$hdata" | sed 's/<a href/\n<a href/g;s@</a>@</a>\n@g' | sed -n 's@<a href="/mathscinet/search/author.html?mrauthid=[^"]*">\([^<]*\)</a>@\1@p' | sed 's/\([^,]*\),.*/\1/')"
        #title="$(echo "$hdata" | tr \\n \  | sed -n 's@.*<span class="title">\([^<]*\)</span>.*@\1@p')"

        url="$(echo "$hdata" | sed -n 's@.*<a target="NEW" href="/leavingmsn?url=\([^"]*\)">\(Article\|Chapter\|Book\)</a>.*@\1@p')"
        if [ -z "$url" ]; then
            case "$data" in
                *"%@ 1201-561X"*)
                    echov Journal: Theory and Applications of Categories
                    volume="$(echo "$data" | sed -n 's/^%V //p')"
                    number="$(echo "$data" | sed -n 's/^%P [^0-9]*\([0-9]*\),.*/\1/p')"
                    echov Volume "$volume", number "$number"
                    stem="$number.pdf"
                    if (( volume < 10 )); then
                        stem="n$stem"
                        if (( volume == 1 )); then
                            stem="v1$stem"
                        fi
                        stem="n$number/$stem"
                    else
                        if (( number < 10 )); then
                            stem="0$stem"
                        fi
                        stem="$number/$volume-$stem"
                    fi
                    if (( volume < 6 )); then
                        ((volume+=1994))
                    fi
                    stem="$volume/$stem"
                    echov Stem "$stem"
                    urls+=("http://tac.mta.ca/tac/volumes/$stem")
                    return ;;
                *"%@ 1431-0635"*)
                    echov Journal: Documenta Mathematica
                    volume="$(echo "$data" | sed -n 's/^%V //p')"
                    pages="$(echo "$data" | sed -n 's/^%P //p')"
                    echov "Volume $volume, $pages"
                    url="http://mathematik.uni-bielefeld.de/documenta/vol-$volume/"$(fetch "http://mathematik.uni-bielefeld.de/documenta/vol-$volume/vol-$volume.html" | tr \\n @ | sed -n "s|.* $pages@"'[^@]*<A HREF="\([^"]*\)">Abstract</A>.*|\1|p') ;;
                *)
                    echon 'No full text URL supplied by MathSciNet. Try zbMATH, sometimes it gives a full text URL when MathSciNet does not.'
                    return ;;
            esac
        fi
    fi
    fturl "$url"
}

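# zbMATH: scrape the abstract page for full-text links and fetch BibTeX for
# title/authors; fetchz transparently handles CAPTCHA challenges.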
zbl() {
    echon Database: zbMATH "$1"
    data="$(fetchz https://zbmath.org/?q=an:"$1")"
    #authors="$(echo "$data" | sed 's/<a href/\n<a href/g;s@</a>@</a>\n@g' | sed -n 's@<a href="authors/?q=ai:[^"]*" title="Author Profile">\([^<]*\)</a>@\1@p' | sed 's/\([^,]*\),.*/\1/')"
    #title="$(echo "$data" | sed -n 's@.*<div class="title">\([^<]*\)<.*@\1@p')"
    url="$(echo "$data" | sed -n 's@.*<a class="btn btn-default btn-xs" type="button" href="\([^"]*\)".*@\1@p')"
    #bibtexurl="$(echo "$data" | sed -n 's@.*<a class="btn btn-mini bib" data-container="body" type="button" href="\([^"]*\)".*@\1@p')"
    bibtexurl="https://zbmath.org/bibtex/$1.bib"
    echov zbMATH BibTeX URL: "$bibtexurl"
    data="$(fetchz "$bibtexurl")"
    echov zbMATH BibTeX:
    echov "$data"
    authors="$(echo "$data" | sed -n 's@^ Author = {\(.*\)},$@ \1 @p' | grep -Eo ' {([^{}]|({[^}]*}))*} ' | sed 's/^..//;s/..$//' | texsimp)"
    echov BibTeX authors: "$authors"
    title="$(echo "$data" | sed -n 's@^ Title = {{\(.*\)}},$@\1@p' | texnorm)"
    echov BibTeX title: "$title"
    if [ -z "$url" ]; then
        echon 'No full text URL supplied by zbMATH. Try MathSciNet, sometimes it gives a full text URL when zbMATH does not.'
        return
    fi
    while read -r iurl; do
        echon Trying zbMATH abstract page URL "$iurl"
        fturl "$iurl"
    done <<< "$url"
}

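# Percent-decode stdin: turn each %XX escape into \xXX and let printf %b expand it.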
urldecode() {
    a="$(cat)"
    printf '%b' "${a//%/\\x}"
}

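# CrossRef: look up the DOI in the search API for title/authors, then hand
# the canonical https://doi.org/ URL to fturl.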
doi() {
    echon Database: CrossRef "$1"
    data="$(fetch "http://search.crossref.org/dois?q=$1")"
    echov CrossRef data: "$data"
    title="$(echo "$data" | sed -n 's@ "title": "\(.*\)",@\1@p')"
    echov CrossRef title: "$title"
    authors="$(echo "$data" | sed 's@&amp;@\n@g;s@",$@@' | sed -n 's/rft.au=+//p;s/rft.au=//p' | tr + \  | urldecode)"
    echov CrossRef authors field: "$authors"
    if echo "$authors" | grep '&'; then
        authors="$(echo "$authors" | sed 's/&/\n/g' | sed 's/^ *//;s/ *$//')"
    else
        authors="$(echo "$authors" | sed 's/.* \([^ ]*\)/\1/')"
    fi
    echov CrossRef authors: "$authors"
    fturl "https://doi.org/$1"
}

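# CiteSeerX: an OAI-PMH GetRecord request supplies Dublin Core title/creator metadata.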
cs() {
    echon Database: CiteSeerX "$1"
    data="$(fetch "http://citeseerx.ist.psu.edu/oai2?verb=GetRecord&metadataPrefix=oai_dc&identifier=oai:CiteSeerX.psu:$1")"
    title="$(echo "$data" | sed -n 's@.*<dc:title>\(.*\)</dc:title>.*@\1@p')"
    authors="$(echo "$data" | sed -n 's@.*<dc:creator>\(.*\)</dc:creator>.*@\1@gp' | sed 's/.* \([^ ]*\)/\1/')"
    urls+=("http://citeseerx.ist.psu.edu/viewdoc/download?doi=$1&rep=rep1&type=pdf")
}

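# Project Euclid: the RIS citation export supplies TI/AU fields.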
pe() {
    echon Database: Project Euclid "$1"
    data="$(fetch http://projecteuclid.org/export_citations --data format=ris --data-urlencode "h=$1")"
    echov Project Euclid bibliographic data:
    echov "$data"
    title="$(echo "$data" | sed -n 's/TI - //p')"
    authors="$(echo "$data" | sed -n 's/AU - //p' | sed 's/\([^ ]*\).*/\1/')"
    urls+=("http://projecteuclid.org/download/pdf_1/$1")
}

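# EuDML: the REST API returns oai_dc metadata for title/creator.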
eudml() {
    echon Database: EuDML "$1"
    data="$(fetch "https://eudml.org/api/rest/urn:$1?format=oai_dc")"
    echov EuDML data:
    echov "$data"
    title="$(echo "$data" | sed -n 's|.*>\([^>]*\)</dc:title>.*|\1|p' | head -1)"
    authors="$(echo "$data" | sed -n 's|.*>\([^>]*\)</dc:creator>.*|\1|p')"
    if echo "$authors" | grep -c , >/dev/null; then
        authors="$(echo "$authors" | sed 's/\([^ ,]*\).*/\1/')"
    else
        authors="$(echo "$authors" | sed 's/.* \([^ ]*\)/\1/')"
    fi
    fturl "https://eudml.org/doc/${1##eudml:doc:}"
}

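# Library Genesis: scrape the book page for ISBNs, then try WorldCat and
# Google Books for cleaner title/author metadata; several mirrors are queued.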
gen() {
    echon Database: Library Genesis "$1"
    data="$(fetch "http://libgen.io/book/index.php?md5=$1")"
    isbn=($(echo "$data" | sed -n 's@.*ISBN:</font></td><td>\([^<]*\)</td>.*@\1@p' | tr -cs 0-9- \ ))
    if [[ ${#isbn[@]} -ne 0 ]]; then
        echov ISBNs from Library Genesis: "${isbn[@]}"
    fi
    title="$(echo "$data" | sed -n 's@.*Title: </font></nobr></td><td colspan=2><b><a href=[^>]*>\([^<]*\)</a>.*@\1@p')"
    authors="$(echo "$data" | sed -n 's@.*Author(s):</font></nobr></td><td colspan=3><b>\([^<]*\)</b>.*@\1@p' | sed 's/(auth.)//g' | sed 's/, /\n/g' | sed 's/^ *//;s/ *$//' | sed -n 's/.* \([^ ]*\)/\1/p')"
    for i in "${!isbn[@]}"; do
        echov Trying ISBN "${isbn[$i]}" with WorldCat
        wdata="$(fetch "http://xisbn.worldcat.org/webservices/xid/isbn/${isbn[$i]}?method=getMetadata&format=json&fl=*")"
        echov ISBN bibliographic data from WorldCat: "$wdata"
        if [[ "ok" != "$(echo "$wdata" | jq -r .stat)" ]]; then
            continue
        fi
        if ! authors="$(echo "$wdata" | jq -e -r '.list[0].author' | sed 's/\.$//;s/ ; /\n/g;s/ and /\n/g;s/, /\n/g' | sed 's/.* \([^ ]*\)/\1/')"; then
            oclc="$(echo "$wdata" | jq -e -r '.list[0].oclcnum[0]')"
            echov OCLC number: "$oclc"
            wwdata="$(fetchr "http://www.worldcat.org/oclc/$oclc?page=endnote&client=worldcat.org-detailed_record")"
            echov EndNote bibliographic data from WorldCat: "$wwdata"
            authors="$(echo "$wwdata" | sed -n 's/^AU - //p' | sed 's/\(.*\),.*/\1/')"
        fi
        echov Authors from WorldCat: "$authors"
        title="$(echo "$wdata" | jq -r .list[0].title)"
        echov Title from WorldCat: "$title"
        if [[ -n "$authors" && -n "$title" ]]; then
            break
        fi
    done
    if [[ -z "$authors" || -z "$title" ]]; then
        for i in "${!isbn[@]}"; do
            echov Trying ISBN "${isbn[$i]}" with Google Books
            bdata="$(fetch "https://www.googleapis.com/books/v1/volumes?q=isbn+${isbn[$i]}&fields=items/volumeInfo(title,authors)&maxResults=1")"
            echov ISBN bibliographic data from Google Books: "$bdata"
            if [[ "$bdata" = "{}" ]]; then
                continue
            fi
            authors="$(echo "$bdata" | jq -r .items[0].volumeInfo.authors[] | sed 's/.* \([^ ]*\)/\1/')"
            echov Authors from Google Books: "$authors"
            title="$(echo "$bdata" | jq -r .items[0].volumeInfo.title)"
            echov Title from Google Books: "$title"
            if [[ -n "$authors" && -n "$title" ]]; then
                break
            fi
        done
    fi
    extn="$(echo "$data" | sed -n 's@.*Extension:</font></nobr></td><td>\([^<]*\)</td>.*@\1@p')"
    id="$(echo "$data" | sed -n 's@.*ID:</font></nobr></td><td>\([^<]*\)</td>.*@\1@p')"
    echov Library Genesis extension: "$extn"
    echov Library Genesis ID: "$id"
    urls+=("http://libgen.me/noleech1.php?hidden=${id::-3}000//${1,,}") # ${id::-3}000/
    urls+=("http://dl.b-ok.org/genesis/${id::-3}000/${1,,}/_as/") # ${id::-3}000/
    urls+=("http://dlx.b-ok.org/genesis/${id::-3}000/${1,,}/_as/")
    urls+=("http://libgen.io/get/$1/name.pdf")
    urls+=("http://userbooks.bookfi.org/2/$1")
}

echon Article ID: "$artid"

case "$artid" in
    http*://*arxiv.org/abs/*)
        echov arXiv URL "$artid"
        arXiv "${artid##*abs/}" ;;
    http*://*arxiv.org/pdf/*)
        echov arXiv URL "$artid"
        trimurl="${artid##*pdf/}"
        arXiv "${trimurl%.pdf}" ;;
    http://front.math.ucdavis.edu/*)
        echov Front for the arXiv URL "$artid"
        arXiv "${artid##*ucdavis.edu/}" ;;
    http://*ams.org/mathscinet-getitem?mr=*)
        echov MathSciNet getitem URL "$artid"
        msn "${artid##*mr=}" ;;
    http://*ams.org/mathscinet/search/publdoc.html*mx-pid=*)
        echov MathSciNet search URL "$artid"
        msnid="${artid##*mx-pid=}"
        msn "${msnid%%&*}" ;;
    http://*ams.org/mathscinet/search/*)
        echov MathSciNet generic search URL "$artid"
        data="$(fetch "$artid")"
        msnid="$(echo "$data" | grep mathscinet-getitem | sed 's/.*mathscinet-getitem?mr=\([^"]*\)".*/\1/')"
        msn "$msnid" ;;
    https://*zbmath.org/?q=an:*)
        echov zbMATH URL "$artid"
        zbl "${artid##*q=an:}" ;;
    https://zbmath.org/*)
        echov zbMATH URL "$artid"
        zbl "${artid##*zbmath.org/}" ;;
    http://*doi.org/* | https://*doi.org/*)
        echov DOI URL "$artid"
        doi "${artid##*doi.org/}" ;;
    http://*gen*md5=*)
        echov Library Genesis URL "$artid"
        genid="${artid##*md5=}"
        gen "${genid%%&*}" ;;
    http*://*citeseerx*/*doi=*)
        echov CiteSeerX URL "$artid"
        csid="${artid##*doi=}"
        cs "${csid%%&*}" ;;
    http://projecteuclid.org/euclid.*)
        echov Project Euclid URL "$artid"
        pe "${artid##http://projecteuclid.org/}" ;;
    http*://*)
        echon Unknown HTTP URL: "$artid"
        echon Attempting generic full-text URL extraction
        title=unknown-title
        authors=unknown-authors
        auti=1
        fturl "$artid" ;;
    arXiv:*)
        arXiv "${artid:6}" ;;
    MR*)
        msn "${artid:2}" ;;
    Zbl:*)
        zbl "${artid:4}" ;;
    doi:*)
        doi "${artid:4}" ;;
    gen:*)
        gen "${artid:4}" ;;
    cs:*)
        cs "${artid:3}" ;;
    pe:*)
        pe "${artid:3}" ;;
    eudml:*)
        eudml "$artid" ;;
    *)
        fatal "Unrecognized article ID: $artid" ;;
esac

echov Title: "$title"
echov Authors:
echov "$authors"

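# Turn a title or author list into a file-name fragment: by default lowercase
# with hyphens; with -f, keep capitalization and separate items with commas.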
stripp() {
    sed 's/\[[^]]*\]//g;s/\\[a-zA-Z]*//g;s|/|-|g' | if [[ -v fancy ]]; then
        sed 's/[[:space:]]\+/ /g;s/^ //;s/ $//' | tr \\n \\f | sed 's/\.$//g' | sed 's/\f$/. /;s/\f/, /g'
    else
        sed 's/.*/\L&/' | sed 's/'"'"'/\f/g;s/[[:punct:]]/ /g;s/\f/'"'"'/g;s/'"''"'//g;s/[[:space:]]\+/-/g;s/^-//;s/-$//' | tr \\n -
    fi
}

title="$(echo -n "$title" | stripp)"
authors="$(echo "$authors" | stripp)"
name="$authors$title"
echov Stripped title: "$title"
echov Combined authors: "$authors"
echov Local file name without extension: "$name"

if [[ -v dirname ]]; then
    echov Directory: "$dirname"
    name="$dirname/$name"
    echon Directory and file name without extension: "$name"
fi

if [[ -v dryrun ]]; then
    echo "$name"
    printf '%s\n' "${urls[@]}"
    exit 0
fi

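# tryft URL: download into a temporary file; on success move it into place,
# optionally run the -e command on it, and exit.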
tryft() {
    tname="$(mktemp /tmp/article-XXX)"
    echon Temporary file name: "$tname"
    if retft "$1" "$tname"; then
        if [ -s "$tname" ]; then
            echon Successfully downloaded "$1"
            fqname="$name.$dextn"
            echon Moving "$tname" to "$fqname"
            mv "$tname" "$fqname"
            if [[ -v cmd ]]; then
                echon Launching "$cmd" "${args[@]:+${args[@]}}" "$fqname"
                "$cmd" "${args[@]:+${args[@]}}" "$fqname"
            fi
            exit 0
        else
            echon Downloaded an empty file, skipping.
        fi
    fi
}

if [ ${#urls[@]} -eq 0 ]; then
    echon No full text URLs found for "$artid"
    echon Email me if you can access the full text.
    exit 1
fi
if [[ -v interactive ]]; then
    echo Full text URLs:
    for i in "${!urls[@]}"; do
        echo "$i) ${urls[$i]}"
    done
    if [[ "${#urls[@]}" == 1 ]]; then
        echon Automatically selecting the only URL
        tryft "${urls[0]}"
        exit 1
    fi
    while true; do
        read -r
        if [ -z "$REPLY" ]; then
            echon Nothing selected
            exit 1
        else
            echon Selected "$REPLY": "${urls["$REPLY"]}"
            tryft "${urls["$REPLY"]}"
        fi
    done
else
    echon Full text URLs:
    for i in "${!urls[@]}"; do
        echon "$i) ${urls[$i]}"
    done
    declare -A tried
    for i in "${!urls[@]}"; do
        if [[ ${tried["${urls["$i"]}"]+_} ]]; then
            echon Skipping the duplicate URL "$i": "${urls[$i]}"
            continue
        fi
        echon Attempting to download full text URL "$i": "${urls[$i]}"
        tried["${urls["$i"]}"]=1
        tryft "${urls[$i]}"
    done
    echon No working full text URLs
    exit 1
fi
