Mirror of https://github.com/papers-we-love/papers-we-love.git, synced 2024-10-27 20:34:20 +00:00
18cd311325
If there is another folder with the same name in your path when running the script, it will enter that folder and recursively try to download all the files. If the folder is big enough, it can hang the computer.
Co-authored-by: Gonzalo Bella <gonzalobella@gambitresearch.com>
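One plausible reading of this bug (an assumption on my part, not something shown by the diff on this page): when CDPATH is set, a bare "cd foo" can silently resolve to a same-named folder elsewhere on the filesystem instead of the subdirectory being traversed. A minimal sketch of a safeguard under that assumption; note that the script below already prefixes its recursive calls with "./", which bypasses CDPATH in the same way:

# Sketch only (assumes CDPATH is the culprit): with CDPATH set, a bare
# "cd papers" may jump to e.g. ~/somewhere-else/papers. Pinning the target
# to the current directory keeps the traversal inside the repo tree.
unset CDPATH
cd "./${f}" || { echo "Error: directory not found." ; exit 1 ; }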
47 lines
1.2 KiB
Bash
Executable File
#!/bin/bash

# Guard clause: check that the required binaries are installed
which wget > /dev/null || { echo "Error: wget not installed." ; exit 1 ; }
which egrep > /dev/null || { echo "Error: egrep not installed." ; exit 1 ; }

# Recursively traverse directories in the repo, scraping each markdown file
# for URLs that point at PDFs. Downloads the PDFs into their respective
# directories.
download_for_directory() {
    cd "$1" || { echo "Error: directory not found." ; exit 1 ; }

    # Recurse into subdirectories in parallel
    for f in *; do
        if [[ -d ${f} ]]; then
            download_for_directory "./${f}" &
        fi
    done

    # Scrape PDF URLs from the markdown files in this directory
    urls=$(cat *.md 2> /dev/null | egrep -o 'https?://[^ ]+' | grep '\.pdf' | tr -d ')')

    for url in ${urls}; do
        # Ignore empty URLs
        if [[ ! -z ${url} ]]; then
            wget "${url}" --no-clobber --quiet --timeout=5 --tries=2
        fi
    done

    cd ..
    echo "$1 done."
}

# If no directories are supplied, iterate over the entire repo.
if [[ "$#" -eq 0 ]]; then
    REPO_ROOT_DIR="$(dirname "$0")/.."
    download_for_directory "${REPO_ROOT_DIR}"
else
    # Iterate over the specified directories
    for dir in "$@"; do
        download_for_directory "${dir}"
    done
fi

# Wait for the background child processes to terminate
wait
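A usage sketch; the script filename and topic directories below are illustrative placeholders, not names confirmed by this page:

# Walk the entire repository (the default when no arguments are given):
./scripts/fetch_papers.sh

# Restrict the run to specific topic directories:
./scripts/fetch_papers.sh datastructures distributed_systems

Subdirectories are traversed in background jobs, so "done." messages from different folders may interleave; the final wait keeps the script alive until every download has finished.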