All improvements are welcome. A pure python or pure bash version would be nice.
The reason for the mix is that bash is poor at reading CSV files with quoted fields, while lynx is good at resolving JavaScript links.
'''
A simple crude (15 minutes) script to download all aptnotes files from BOX.COM
The script reads the .csv and uses lynx and grep to find the correct filepath
This script is tested on Ubuntu with lynx
sudo apt-get install lynx
'''
import csv
import os
import subprocess
import shlex
import hashlib
# Root directory for downloads; each file is stored under BASEDIR/<Year>/.
BASEDIR = "dl"
def getsha1(filepath, chunksize=65536):
    """Return the hexadecimal SHA-1 digest of the file at ``filepath``.

    The file is opened in binary mode and fed to the hash in blocks of
    ``chunksize`` bytes, so memory use stays bounded no matter how large
    the file is.

    Raises IOError/OSError if the file cannot be opened or read.
    """
    digest = hashlib.sha1()
    with open(filepath, 'rb') as f:
        # iter() with a sentinel keeps calling f.read() until it returns b''.
        for chunk in iter(lambda: f.read(chunksize), b''):
            digest.update(chunk)
    return digest.hexdigest()
def makedir(dirname):
    """Ensure the directory ``BASEDIR/dirname`` exists.

    Missing parent directories are created as well. Calling this for a
    directory that already exists is a no-op.

    Raises OSError if the directory cannot be created, including when the
    path already exists but is not a directory.
    """
    # Built by plain concatenation so the path matches the one the main
    # loop builds for the downloaded file.
    target = BASEDIR + os.sep + dirname
    try:
        # Attempt the creation directly instead of checking first, which
        # avoids a race between the existence check and makedirs().
        os.makedirs(target)
    except OSError:
        # An existing directory is fine; anything else is a real failure.
        if not os.path.isdir(target):
            raise
def getlink(linkurl, filename):
    """Download the file behind the page ``linkurl`` to ``filename``.

    Does nothing when ``filename`` already exists. Otherwise the page is
    rendered with ``lynx -dump``, every ``https://`` URL is extracted with
    grep, the ones containing ``/download?shared_link=https://`` are kept,
    and those are handed to ``wget -O filename`` through xargs.

    The exit status returned by ``os.system`` is printed. The function
    itself returns None.
    """
    if os.path.exists(filename):
        return

    # Function-scope import so the block works on both Python 3
    # (shlex.quote) and Python 2 (pipes.quote).
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    print("Downloading {}".format(filename))
    # Both values come from the CSV and are interpolated into a shell
    # command, so they are shell-quoted: an unquoted '&', ';', '$' or
    # backtick would otherwise break or hijack the pipeline.
    dlcommand = (
        'lynx -dump {0} | grep -Eh --only-matching "https://[^ ]+"'
        ' | grep "/download?shared_link=https://"'
        ' | xargs wget -O {1}'
    ).format(quote(linkurl), quote(filename))
    print(os.system(dlcommand))
def getcsvfile():
    """Fetch the current APTnotes.csv index into the working directory.

    The file is downloaded with wget to a temporary name and only moved
    over ``APTnotes.csv`` when wget reports success. A failed download
    therefore leaves any previous copy untouched. Returns None.
    """
    url = "https://raw.githubusercontent.com/aptnotes/data/master/APTnotes.csv"
    target = "APTnotes.csv"
    partial = target + ".part"

    # Plain "wget URL" saves to "APTnotes.csv.1" when the file already
    # exists, so a re-run would keep reading the stale index. Writing to
    # an explicit output name avoids that.
    status = os.system("wget -O {0} {1}".format(partial, url))
    if status == 0:
        # Remove the old copy first so the rename also works on platforms
        # where os.rename() refuses to overwrite.
        if os.path.exists(target):
            os.remove(target)
        os.rename(partial, target)
    else:
        print("Could not download {0} (wget status {1})".format(target, status))
        # wget -O may leave an empty or partial file behind on failure.
        if os.path.exists(partial):
            os.remove(partial)
# Script body: refresh the index, then download and verify every entry.
getcsvfile()
with open("APTnotes.csv", "r") as aptnotes:
    csvreader = csv.DictReader(aptnotes, delimiter=',', quotechar='"')
    for row in csvreader:
        makedir(row.get('Year'))
        filename = BASEDIR + os.sep + row.get('Year') + os.sep + row.get('Filename') + ".pdf"
        getlink(row.get('Link'), filename)

        # If the download produced no file, hashing it would raise and
        # abort the whole run; report it and carry on with the next row.
        if not os.path.exists(filename):
            print("Download failed {}".format(filename))
            continue

        # hexdigest() is lowercase; normalise the CSV value so case or
        # stray whitespace alone does not cause a false mismatch.
        expected = (row.get('SHA-1') or '').strip().lower()
        if expected != getsha1(filename):
            print("Filehash differs {}".format(filename))
print("Done")
No comments:
Post a Comment