import os
import urllib.request as ur
from bs4 import BeautifulSoup
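# Recursively mirror an Apache-style HTTP directory listing: walk every
# subdirectory link and download each file into a matching local tree.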
root_url = ""  # root of the remote directory listing to mirror; must end with '/'
def download_single_file(remote_url, filename):
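    # Download remote_url + filename into a local directory that mirrors
    # the remote path under root_save_dir.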
    root_save_dir = "/Users/dph/Downloads/"
    # Strip the root_url prefix so the remote layout is reproduced locally
    # (the original hard-coded slice [42:] assumed a fixed-length root URL).
    save_dir = root_save_dir + remote_url[len(root_url):]
    if not os.path.exists(save_dir):
        # makedirs creates intermediate directories; mkdir would fail on
        # nested paths that appear during the recursive walk.
        os.makedirs(save_dir)
    try:
        ur.urlretrieve(remote_url + filename, save_dir + filename)
    except Exception as e:
        print("something is wrong", remote_url, filename, e)
def is_directory(url):
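    # Links in an index page that end with '/' point at subdirectories.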
    return url.endswith('/')
def get_links(url):
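    # Parse the listing page and return the href of every anchor that follows
    # the first table cell (the layout Apache-style index pages use).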
    req = ur.Request(url)
    html_page = ur.urlopen(req)
    soup = BeautifulSoup(html_page, "lxml")
    links = []
    for link in soup.find('td').find_all_next('a'):
        links.append(link.get('href'))
    return links
def download_resources(url):
    # The first link in a listing is the parent-directory ("home") entry,
    # so skip it to avoid walking back up the tree.
    links = get_links(url)[1:]
    print("########### Found these links ###########")
    print(links, url)
    # Recurse into subdirectories; download everything else as a file.
    for link in links:
        if is_directory(link):
            # Hrefs in the listing are relative, so append them to the URL.
            new_url = url + link
            download_resources(new_url)
        else:
            print("########### Download this file ###########")
            print(url, link)
            download_single_file(url, link)
if __name__ == '__main__':
    download_resources(root_url)
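# Example usage (hypothetical URL): set root_url above, e.g.
#   root_url = "http://example.com/files/"
# then run the script; the remote tree is mirrored under root_save_dir.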