
Crawl and download files from a remote directory listing

The script below recursively walks an Apache-style HTTP directory index and mirrors every file it finds into a local folder.

import os
import urllib.request as ur
from bs4 import BeautifulSoup


# Configuration: fill these in before running.
root_url = ""      # URL of the top-level directory listing to crawl; must end with '/'
sample_file = ""   # unused placeholder
skip_dir = ""      # unused placeholder
save_dir = ""      # unused placeholder
skip_dirs = []     # unused placeholder




def download_single_file(remote_url, filename):
    root_save_dir = "/Users/dph/Downloads/"
    # Mirror the remote directory structure under root_save_dir by
    # stripping the root_url prefix from the current directory URL.
    save_dir = root_save_dir + remote_url[len(root_url):]
    os.makedirs(save_dir, exist_ok=True)
    try:
        ur.urlretrieve(remote_url + filename, os.path.join(save_dir, filename))
    except Exception as e:
        print("failed to download", remote_url + filename, e)




def is_directory(url):
    # Directory entries in the listing end with a trailing slash.
    return url.endswith('/')


def get_links(url):
    req = ur.Request(url)
    html_page = ur.urlopen(req)

    soup = BeautifulSoup(html_page, "lxml")

    # The index page lists entries as anchors inside a table; collect the
    # href of every <a> that follows the first <td> in document order.
    links = []
    for link in soup.find('td').find_all_next('a'):
        links.append(link.get('href'))
    return links
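get_links assumes the page looks like an Apache-style autoindex, where every entry is an anchor following the first table cell. As a rough sketch (the HTML below is a made-up sample, not output from any real server), the same selection logic extracts the hrefs like this:

from bs4 import BeautifulSoup  # repeated here so the snippet runs on its own

# Made-up sample of an Apache-style index page, for illustration only.
sample_listing = """
<table>
  <tr><th>Name</th><th>Last modified</th></tr>
  <tr><td><a href="/parent/">Parent Directory</a></td></tr>
  <tr><td><a href="docs/">docs/</a></td></tr>
  <tr><td><a href="readme.txt">readme.txt</a></td></tr>
</table>
"""
soup = BeautifulSoup(sample_listing, "lxml")
print([a.get('href') for a in soup.find('td').find_all_next('a')])
# prints ['/parent/', 'docs/', 'readme.txt'] -- the leading parent entry
# is why download_resources drops the first link with [1:]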



def download_resources(url):
    # List the entries on this index page; the first link is the
    # "Parent Directory" entry, so drop it to avoid walking back upwards.
    links = get_links(url)[1:]
    print("########### Found these links ###########")
    print(links, url)

    # Recurse into sub-directories and download everything else as a file.
    for link in links:
        if is_directory(link):
            new_url = url + link
            download_resources(new_url)
        else:
            print("########### Downloading this file ###########")
            print(url, link)
            download_single_file(url, link)




if __name__ == '__main__':
    download_resources(root_url)
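To run the script, set root_url at the top to the directory index you want to mirror, and change root_save_dir in download_single_file to a writable local folder. The value below is only a placeholder, not a real server:

# Hypothetical configuration; replace with a real directory index URL.
root_url = "http://example.com/files/"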




