import hashlib
import os
import shutil
from typing import Dict
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

from tools.deduper import Deduper
from tools.extensions import check_and_add_extension


class HTMLTemplateConverter:
    """Convert a tree of HTML files into templates with externalized assets.

    Every ``.html`` file found under ``src_dir`` is rewritten into
    ``dest_dir``: inline ``<svg>`` elements are replaced by
    ``{% include '...' %}`` directives pointing at files under
    ``dest_dir/svgs``, and CSS/JS/image references are rewritten to
    ``/files/...`` (asset already known to the dedupe pool) or
    ``/static/...`` (asset copied into ``dest_dir/static``).
    """

    def __init__(self, src_dir: str, dest_dir: str, dedupe_dir: str, reset: bool = False):
        """Store the directories and optionally wipe the destination.

        Args:
            src_dir: Directory walked for ``.html`` files; also the fallback
                base for asset paths that do not exist as given.
            dest_dir: Output directory for the converted files.
            dedupe_dir: Directory handed to ``Deduper``.
            reset: When true and ``dest_dir`` exists, delete it first.
        """
        self.src_dir = src_dir
        self.dest_dir = dest_dir
        self.static_dir = f"{dest_dir}/static"
        self.dedupe_dir = dedupe_dir
        if reset and os.path.exists(self.dest_dir):
            print(" - reset")
            shutil.rmtree(self.dest_dir)
        self.deduper = Deduper(self.dedupe_dir)

    def _resolve_local_path(self, file_path: str) -> str:
        """Return an existing path for ``file_path``, trying ``src_dir`` as a base.

        Raises:
            FileNotFoundError: If neither ``file_path`` nor
                ``src_dir/file_path`` exists.
        """
        if os.path.exists(file_path):
            return file_path
        # Plain string concatenation on purpose: os.path.join would discard
        # src_dir for references that start with "/" (e.g. "/css/site.css").
        candidate = f"{self.src_dir}/{file_path}"
        if not os.path.exists(candidate):
            raise FileNotFoundError(f"File '{candidate}' does not exist.")
        return candidate

    def add_to_static(self, file_path: str):
        """Copy ``file_path`` into the static directory under a unique name.

        The target name is the lowercased base name and extension. If a file
        with that name already exists and has the same MD5, it is reused;
        otherwise ``_2``, ``_3``, ... suffixes are tried until a free name or
        an identical file is found.

        Returns:
            The file name (not the full path) inside the static directory.

        Raises:
            FileNotFoundError: If the source file cannot be located.
        """
        file_path = self._resolve_local_path(file_path)

        # The static directory is otherwise only created by convert(); make
        # this method usable on its own.
        os.makedirs(self.static_dir, exist_ok=True)

        base_name, extension = os.path.splitext(os.path.basename(file_path))
        base_path = os.path.join(self.static_dir, base_name.lower())
        extension = extension.lower()
        new_path = f"{base_path}{extension}"

        file_hash = self.deduper._calculate_md5(file_path)

        counter = 2
        while os.path.exists(new_path):
            # Same content already stored under this name: reuse it.
            if file_hash == self.deduper._calculate_md5(new_path):
                return os.path.basename(new_path)
            new_path = f"{base_path}_{counter}{extension}"
            counter += 1

        shutil.copy2(file_path, new_path)
        return os.path.basename(new_path)

    def add_file(self, file_path: str, remove: bool = False) -> str:
        """Register an asset and return the URL path to reference it by.

        ``http://`` / ``https://`` references are downloaded into
        ``/tmp/files`` and then processed as local files, with the download
        deleted afterwards. Local files are looked up in the dedupe pool
        first; on a miss they are copied into the static directory.

        Args:
            file_path: Local path (absolute, or relative to the working
                directory or ``src_dir``) or an HTTP(S) URL.
            remove: Delete the local source file once it has been handled.
                Used internally for downloaded temporary files.

        Returns:
            ``/files/<path>`` for a dedupe-pool hit, ``/static/<name>`` for a
            file copied to static, or ``""`` when a download does not return
            HTTP 200.

        Raises:
            FileNotFoundError: If a local file cannot be located.
        """
        if file_path.startswith(('http://', 'https://')):
            # NOTE(review): no timeout is passed, so a stalled server blocks
            # the conversion indefinitely — consider adding one.
            response = requests.get(file_path)
            if response.status_code != 200:
                print(f"ERROR: failed to download {file_path}")
                return ""

            url_path = urlparse(file_path).path
            local_filename = os.path.basename(url_path)

            temp_dir = "/tmp/files"
            os.makedirs(temp_dir, exist_ok=True)
            temp_path = os.path.join(temp_dir, local_filename)
            with open(temp_path, 'wb') as f:
                f.write(response.content)
            print(f" - download {file_path}")
            return self.add_file(temp_path, remove=True)

        file_path = self._resolve_local_path(file_path)

        # presumably a path relative to the dedupe pool, or a falsy value when
        # the content is unknown — verify against Deduper.check_from_path.
        existing_path = self.deduper.check_from_path(file_path)
        if existing_path:
            print(" - exists in dedupe pool")
            if remove:
                # Previously referenced an undefined `temp_path` here, which
                # raised NameError for every downloaded duplicate.
                os.remove(file_path)
            return f"/files/{existing_path}"

        # Not in the dedupe pool: copy to the static directory.
        static_path = self.add_to_static(file_path)
        if remove:
            # The copy is in place, so the temporary download can go.
            os.remove(file_path)
        return f"/static/{static_path}"

    def _externalize_svgs(self, soup, file: str) -> None:
        """Replace each inline ``<svg>`` in ``soup`` with an include directive."""
        for i, svg in enumerate(soup.find_all('svg'), start=1):
            # NOTE(review): fixed scratch path; concurrent runs would clash.
            svg_file_path = "/tmp/my.svg"
            with open(svg_file_path, 'w', encoding='utf-8') as svg_file:
                svg_file.write(str(svg))

            existing_path = self.deduper.check_from_path(svg_file_path)
            if existing_path:
                svg_filename = f'svgs/{existing_path}'
                print(f" - svg exists in dedupe pool: , '{svg_filename}'")
                link_target = os.path.join(self.dedupe_dir, existing_path)
                link_path = os.path.join(self.dest_dir, svg_filename)
                print(link_path)
                # islink() also catches dangling symlinks, which exists()
                # reports as missing.
                if not (os.path.exists(link_path) or os.path.islink(link_path)):
                    # Create the parent under dest_dir; the relative
                    # svg_filename would resolve against the working directory.
                    os.makedirs(os.path.dirname(link_path), exist_ok=True)
                    link_target = os.path.abspath(link_target)
                    link_path = os.path.abspath(link_path)
                    print(f" - symlink: {link_target} {link_path}")
                    os.symlink(link_target, link_path)
            else:
                svg_filename = f'svgs/{file}_svg_{i}.svg'
                shutil.copy(svg_file_path, os.path.join(self.dest_dir, svg_filename))
                print(f" - svg new {svg_filename}")

            # Clean up the scratch file whichever branch ran.
            if os.path.exists(svg_file_path):
                os.remove(svg_file_path)
            svg.replace_with(f"{{% include '{svg_filename}' %}}")

    def _rewrite_asset_links(self, soup) -> None:
        """Point stylesheet, script and image references at registered copies."""
        for link in soup.find_all('link', href=True):
            if link['href'].endswith('.css'):
                link['href'] = self.add_file(link['href'])

        for script in soup.find_all('script', src=True):
            if script['src'].endswith('.js'):
                script['src'] = self.add_file(script['src'])

        for img in soup.find_all('img', src=True):
            img['src'] = self.add_file(img['src'])

    def convert(self) -> None:
        """Convert every ``.html`` file under ``src_dir`` into ``dest_dir``.

        Output files are written directly into ``dest_dir`` using the source
        file's base name, so same-named files from different subdirectories
        overwrite one another.
        """
        os.makedirs(self.dest_dir, exist_ok=True)
        os.makedirs(os.path.join(self.dest_dir, 'svgs'), exist_ok=True)
        os.makedirs(os.path.join(self.dest_dir, 'static'), exist_ok=True)

        for root, _, files in os.walk(self.src_dir):
            for file in files:
                if not file.endswith('.html'):
                    continue

                src_file_path = os.path.join(root, file)
                dest_file_path = os.path.join(self.dest_dir, file)

                with open(src_file_path, 'r', encoding='utf-8') as html_file:
                    soup = BeautifulSoup(html_file.read(), 'html.parser')

                self._externalize_svgs(soup, file)
                self._rewrite_asset_links(soup)

                with open(dest_file_path, 'w', encoding='utf-8') as dest_file:
                    dest_file.write(str(soup.prettify()))


# Example usage
#
# converter = HTMLTemplateConverter("source_directory", "destination_directory", "dedupe_directory")
# converter.convert()


def new(src_dir: str, dest_dir: str, dedupe_dir: str, reset: bool = False) -> HTMLTemplateConverter:
    """Build an ``HTMLTemplateConverter``, run ``convert()`` and return it."""
    f = HTMLTemplateConverter(src_dir, dest_dir, dedupe_dir, reset=reset)
    f.convert()
    return f