# heroweb/lib/tools/templatefixer.py
import hashlib
import os
import shutil
from typing import Dict
from urllib.parse import urlparse

import redis
import requests
from bs4 import BeautifulSoup
from colorama import Fore, init

from tools.deduper import Deduper
from tools.extensions import check_and_add_extension

# File extensions treated as image/movie assets when rewriting <link> hrefs.
image_movie_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp',
                          '.mp3', '.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm')
class HTMLTemplateConverter:
    """Convert a tree of static HTML files into Jinja templates.

    Assets referenced by the HTML (css, js, images, inline svg) are copied into
    a shared, deduplicated static directory (via ``Deduper``) and the HTML
    references are rewritten to ``/static/...`` paths. Remote assets are
    downloaded with a Redis-backed cache.
    """

    def __init__(self, src_dir: str, dest_dir: str, static_dir: str = "", reset: bool = False):
        """
        Args:
            src_dir: directory containing the source .html files.
            dest_dir: directory where the generated Jinja templates are written.
            static_dir: deduplicated asset directory; defaults to "<dest_dir>/static".
            reset: when True, remove dest_dir before converting.
        """
        self.src_dir = src_dir
        self.dest_dir = dest_dir
        if static_dir == "":
            static_dir = f"{dest_dir}/static"
        if reset and os.path.exists(self.dest_dir):
            print(" - reset")
            shutil.rmtree(self.dest_dir)
        self.deduper_static = Deduper(static_dir)
        # NOTE(review): assets are loaded only on reset, immediately after
        # dest_dir was wiped — confirm this condition is not inverted
        # (i.e. that it should not be "if not reset").
        if reset:
            self.deduper_static.load_assets()
        self.redis_client = redis.StrictRedis(host="localhost", port=6379, db=0)
        self.cache_expiration = 3600  # download-cache TTL in seconds (1 hour)

    def download_file(self, myurl: str, remove: bool = False) -> str:
        """Download *myurl* into /tmp/files, using a Redis cache of recent downloads.

        Args:
            myurl: the http(s) URL to fetch.
            remove: when True, delete the local file and the cache entry again
                before returning (the returned path then no longer exists).

        Returns:
            Local filesystem path of the downloaded file.

        Raises:
            Exception: when the HTTP response status is not 200.
        """
        key = f"web.download.{myurl}"
        cached_path = self.redis_client.get(key)
        # Only trust the cache when the file still exists on disk — /tmp may
        # have been cleaned while the Redis entry was still alive.
        if cached_path and os.path.exists(cached_path.decode('utf-8')):
            print(f" - download cached {myurl}")
            temp_path = cached_path.decode('utf-8')
        else:
            print(f" - download {myurl}")
            # Timeout so a dead server cannot hang the whole conversion.
            response = requests.get(myurl, timeout=60)
            if response.status_code == 200:
                if "?" in myurl:
                    # Query strings make the URL basename ambiguous; use a stable hash.
                    local_filename = hashlib.md5(myurl.encode('utf-8')).hexdigest()
                else:
                    url_path = urlparse(myurl).path
                    base_name, extension = os.path.splitext(os.path.basename(url_path))
                    local_filename = base_name + extension
                # Download into a shared temporary directory.
                temp_dir = os.path.join("/tmp/files")
                os.makedirs(temp_dir, exist_ok=True)
                temp_path = os.path.join(temp_dir, local_filename)
                with open(temp_path, 'wb') as f:
                    f.write(response.content)
                # Remember where this URL was stored, with a TTL.
                self.redis_client.setex(key, self.cache_expiration, temp_path)
            else:
                raise Exception(f"ERROR: failed to download {myurl}")
        if remove:
            os.remove(temp_path)
            self.redis_client.delete(key)
        return temp_path

    def add_to_static(self, file_path: str, dest_dir_rel: str = "") -> str:
        """Add *file_path* to the static directory (deduplicated).

        Returns the path to be used in the template for the file link, or an
        ``error/...`` marker path when the file cannot be found.
        """
        if not os.path.exists(file_path):
            # Retry relative to the source tree before giving up.
            file_path2 = f"{self.src_dir}/{file_path}"
            if not os.path.exists(file_path2):
                print(f"{Fore.RED}ERROR: File '{file_path}' or '{file_path2}' does not exist.{Fore.RESET}")
                return f"error/{file_path2}"
            file_path = file_path2
        # Reuse an already-deduplicated copy when one exists.
        file_dedupe_location = self.deduper_static.path_check(file_path)
        if file_dedupe_location:
            return file_dedupe_location
        return self.deduper_static.add(source_path=file_path, dest_dir_rel=dest_dir_rel)

    def add_file(self, src_file_path: str, file_path: str, remove: bool = False, dest_dir_rel: str = "") -> str:
        """Resolve *file_path* (URL, absolute, or relative to *src_file_path*)
        and add it to the static directory.

        Returns the template-relative static path, or an ``error/...`` marker
        on download/lookup failure.
        """
        # NOTE(review): the `remove` parameter is accepted (and passed as True in
        # the recursive call below) but never acted on in this method — the
        # downloaded temp files double as the Redis-backed cache; confirm intended.
        print (f" - addfile {file_path} for dest_dir_rel:{dest_dir_rel}\n from out of file: {src_file_path}")
        if file_path.startswith('http://') or file_path.startswith('https://'):
            try:
                temp_path = self.download_file(file_path)
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate.
            except Exception:
                print(f"{Fore.RED}ERROR DOWNLOAD: File '{file_path}'.{Fore.RESET}")
                return f"error/download/{file_path}"
            # Recurse with the downloaded local file (no originating source file).
            return self.add_file("", temp_path, remove=True, dest_dir_rel=dest_dir_rel)
        if not os.path.exists(file_path):
            # Resolve relative to the HTML file that referenced it.
            file_path2 = os.path.abspath(os.path.join(os.path.dirname(src_file_path), file_path))
            if os.path.exists(file_path2):
                file_path = file_path2
            else:
                print(f"{Fore.RED}ERROR: File '{file_path}' or `{file_path2}` does not exist.{Fore.RESET}")
                return f"error/{file_path}"
        # Reuse an already-deduplicated copy when one exists.
        existing_path = self.deduper_static.path_check(file_path)
        if existing_path:
            return existing_path
        return self.add_to_static(file_path, dest_dir_rel=dest_dir_rel)

    def convert(self) -> None:
        """Convert every .html file under src_dir into a Jinja template under dest_dir.

        Inline <svg> elements are extracted into static files and replaced with
        Jinja includes; <link>, <script> and <img> references are rewritten to
        their deduplicated ``/static/...`` locations.
        """
        os.makedirs(self.dest_dir, exist_ok=True)
        for root, _, files in os.walk(self.src_dir):
            for file in files:
                if not file.endswith('.html'):
                    continue
                src_file_path = os.path.abspath(os.path.join(root, file))
                rel_path = os.path.relpath(src_file_path, self.src_dir)
                dest_file_path = os.path.join(self.dest_dir, rel_path)
                os.makedirs(os.path.dirname(dest_file_path), exist_ok=True)
                with open(src_file_path, 'r', encoding='utf-8') as html_file:
                    html_content = html_file.read()
                soup = BeautifulSoup(html_content, 'html.parser')

                # Extract inline SVGs into deduplicated static files, symlink
                # them into dest_dir, and replace them with Jinja includes.
                for svg in soup.find_all('svg'):
                    # NOTE(review): fixed temp path — not safe if two
                    # conversions run concurrently.
                    svg_file_path = "/tmp/my.svg"
                    with open(svg_file_path, 'w', encoding='utf-8') as svg_file:
                        svg_file.write(str(svg))
                    svg_path = self.add_file(src_file_path, file_path=svg_file_path, dest_dir_rel="svg")
                    svg_path_full = f"{self.deduper_static.path}/{svg_path}"
                    if not os.path.exists(svg_path_full):
                        raise Exception(f"file svg does not exist {svg_path_full}")
                    svg_file_path2 = os.path.join(self.dest_dir, svg_path)
                    if not (os.path.exists(svg_file_path2) or os.path.islink(svg_file_path2)):
                        os.makedirs(os.path.dirname(svg_file_path2), exist_ok=True)
                        svg_path_full = os.path.abspath(svg_path_full)
                        svg_file_path2 = os.path.abspath(svg_file_path2)
                        print(f" - symlink: {svg_path_full} {svg_file_path2}")
                        os.symlink(svg_path_full, svg_file_path2)
                    svg.replace_with(f"{{% include '{svg_path}' %}}")
                    os.remove(svg_file_path)

                for link in soup.find_all('link', href=True):
                    href = link['href']
                    base_href = href.split('?')[0] if '?' in href else href
                    if base_href.endswith('.css'):
                        new_href = self.add_file(src_file_path, base_href, dest_dir_rel="css")
                        link['href'] = f"/static/{new_href}"
                    else:
                        if base_href.lower().endswith(image_movie_extensions):
                            new_src = self.add_file(src_file_path, base_href, dest_dir_rel="img")
                        elif href.startswith('http://') or href.startswith('https://'):
                            new_src = self.add_file(src_file_path, href)
                        else:
                            new_src = self.add_file(src_file_path, base_href)
                        # BUGFIX: Tag.has_key() was removed in BeautifulSoup 4
                        # and raised at runtime here; has_attr() is the
                        # supported replacement.
                        if link.has_attr("src"):
                            link['src'] = f"/static/{new_src}"
                        elif link.has_attr("href"):
                            link['href'] = f"/static/{new_src}"

                for script in soup.find_all('script', src=True):
                    src = script['src']
                    src_href = src.split('?')[0] if '?' in src else src
                    if src_href.endswith('.js'):
                        new_src = self.add_file(src_file_path, src_href, dest_dir_rel="js")
                        script['src'] = f"/static/{new_src}"

                for img in soup.find_all('img', src=True):
                    new_src = self.add_file(src_file_path, img['src'], dest_dir_rel="img")
                    img['src'] = f"/static/{new_src}"

                jinja_template = str(soup.prettify())
                with open(dest_file_path, 'w', encoding='utf-8') as dest_file:
                    dest_file.write(jinja_template)
# Example usage
#
# converter = HTMLTemplateConverter("source_directory", "destination_directory")
# converter.convert()
def new(src_dir: str, dest_dir: str, static_dir: str = "", reset: bool = False) -> HTMLTemplateConverter:
    """Build an HTMLTemplateConverter, run the conversion, and return the instance."""
    converter = HTMLTemplateConverter(src_dir, dest_dir, static_dir=static_dir, reset=reset)
    converter.convert()
    return converter