'''
# Web Scraping
[@dwancin on HuggingFace](https://huggingface.co/spaces/dwancin/web-scraping)
'''
import os, re, requests, uuid, zipfile, hashlib, shutil
import gradio as gr
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
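# Assumed third-party dependencies (a requirements.txt is not shown in this listing):
# gradio, requests, beautifulsoup4 -- e.g. `pip install gradio requests beautifulsoup4`.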
# Function to validate URLs
def validator(url):
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)
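# For example (following urlparse's behavior):
#   validator("https://example.com/a.png")  # True: has both scheme and netloc
#   validator("/img/a.png")                 # False: relative URL, no scheme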
# Function to find files on a webpage
def finder(url, soup, media_type):
    files = []
    # find image files (the entries below are file extensions, matched as substrings)
    if media_type == "image":
        extensions = ['jpg', 'jpeg', 'png', 'svg', 'gif', 'webp', 'tiff', 'psd', 'eps', 'ai', 'indd', 'raw']
        for img in soup.find_all('img'):
            src = img.get('src')
            # skip <img> tags without a src attribute, which would otherwise raise a TypeError
            if src and any(ext in src for ext in extensions):
                file_url = src
                if not validator(file_url):
                    file_url = urljoin(url, file_url)
                files.append(file_url)
    # find text (only real HTML tags; extensions such as 'pdf' or 'txt' never match elements)
    elif media_type == "text":
        text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'strong']
        for tag in text_tags:
            for element in soup.find_all(tag):
                files.append(element.get_text())
    # find links
    else:
        for link in soup.find_all('a'):
            href = link.get('href')
            # skip anchors without an href attribute
            if href and media_type in href:
                file_url = href
                if not validator(file_url):
                    file_url = urljoin(url, file_url)
                files.append(file_url)
    return files
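# A quick sketch of how finder() is used (assuming `response` holds a fetched page):
#   soup = BeautifulSoup(response.content, 'html.parser')
#   finder("https://example.com", soup, "image")  # -> absolute URLs of matching <img> tags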
# Function to download the files
def downloader(urls, folder_name):
    os.makedirs(folder_name, exist_ok=True)
    for url in urls:
        # skip files that cannot be fetched instead of aborting the whole batch
        try:
            response = requests.get(url, stream=True, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException:
            print(f"Skipping file: {url}")
            continue
        file_extension = url.split(".")[-1].split("&")[0]
        url_hash = hashlib.md5(url.encode()).hexdigest()
        unique_id = str(uuid.uuid4())[:8]
        file_name = f'{url_hash}-{unique_id}.{file_extension}'
        file_name = file_name[:255]
        file_name = re.sub(r'[\\/:"*?<>|]+', '_', file_name)
        with open(os.path.join(folder_name, file_name), 'wb') as out_file:
            out_file.write(response.content)
        print(f"Downloaded file: {file_name}")
# Function to create the zip file
def zipper(folder_name):
    if os.listdir(folder_name):
        with zipfile.ZipFile(f'{folder_name}.zip', 'w') as zipf:
            for file in os.listdir(folder_name):
                zipf.write(os.path.join(folder_name, file))
        return f'{folder_name}.zip'
    else:
        return ""
# Function to access the website and collect its media
def scrapper(url, images=False, text=False):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        raise  # let checker() below report the HTTP status code
    except (requests.exceptions.RequestException, ValueError):
        raise gr.Error(f"Unable to access URL: {url}")
    soup = BeautifulSoup(response.content, 'html.parser')
    # Clear all the previous folder data
    if images:
        shutil.rmtree('images', ignore_errors=True)
    if text:
        shutil.rmtree('text', ignore_errors=True)
    # Add images to the images folder
    if images:
        image_urls = finder(url, soup, 'image')
        os.makedirs('images', exist_ok=True)
        if image_urls:
            downloader(image_urls, 'images')
        else:
            raise gr.Error("Found no images.")
    # Add text files to the text folder
    if text:
        text_content = finder(url, soup, 'text')
        os.makedirs('text', exist_ok=True)
        if text_content:
            with open('text/content.txt', 'w', encoding='utf-8') as text_file:
                for line in text_content:
                    text_file.write(line + '\n')
    # Output folder(s) as zip files
    images_zip_file, text_zip_file = None, None
    if images and os.path.exists('images') and os.listdir('images'):
        images_zip_file = zipper('images')
    if text and os.path.exists('text') and os.listdir('text'):
        text_zip_file = zipper('text')
    return images_zip_file, text_zip_file
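# scrapper() returns an (images_zip, text_zip) pair; an entry is None when that
# media type was not requested or when nothing was downloaded for it.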
# Function to validate inputs and surface request errors
def checker(url, media_types):
    if not url:
        raise gr.Error("URL cannot be empty.")
    if not url.startswith("https://"):
        raise gr.Error("The URL must begin with https://")
    if not media_types:
        raise gr.Error("At least one media type must be selected.")
    try:
        image_file, text_file = scrapper(url, "Images" in media_types, "Text" in media_types)
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 403:
            raise gr.Error("HTTP Error: Forbidden. Access to the URL is forbidden.")
        else:
            raise gr.Error(f"HTTP Error: {e.response.status_code}")
    except TypeError as e:
        raise gr.Error(f"TypeError: {str(e)}")
    except (requests.exceptions.RequestException, ValueError):
        raise gr.Error(f"Unable to access URL: {url}")
    files = []
    if "Text" in media_types and not text_file:
        raise gr.Error("Found no text.")
    if "Images" in media_types and not image_file:
        raise gr.Error("Found no images.")
    if image_file:
        files.append(image_file)
    if text_file:
        files.append(text_file)
    print(f"Returning downloaded files from {url} in {files} ...")
    return files
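# A note on the error handling above (standard Gradio behavior, not specific to
# this app): raising gr.Error inside an event handler shows the message in the
# UI instead of crashing the app, which is why checker() raises it everywhere.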
# Gradio Interface
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as app:
    title = gr.Markdown('''# Web Scraping 🕵️''')
    description = gr.Markdown('''Get all media files from your desired webpages with just a few clicks.''')
    with gr.Row():
        with gr.Column(scale=0, min_width=480, variant="panel", elem_id="sd-panel"):
            url_name = gr.Textbox(
                placeholder="Enter URL here",
                show_label=True,
                label="Website",
            )
            media_types = gr.CheckboxGroup(
                ["Images", "Text"],
                value=["Images"],
                label="Media types",
            )
            submit_button = gr.Button(
                "Submit",
                variant="primary",
                interactive=True,
            )
        with gr.Column(scale=2):
            output_files = gr.Files(
                label="Output",
                elem_id="file-list",
                size="lg",
                show_label=False,
            )
    submit_button.click(
        checker,
        inputs=[url_name, media_types],
        outputs=[output_files],
    )

app.launch()
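# To try this locally (assuming the file is saved as app.py and the dependencies
# noted above are installed): run `python app.py` and open the URL Gradio prints.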