# herolib_python/lib/web/doctools/md_replacer.py

import sys
import os

# Add the parent directory of herotools to the Python module search path
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from herotools.logger import logger
from markdown_it import MarkdownIt
from markdown_it.tree import SyntaxTreeNode
import re
from enum import Enum
from herotools.texttools import name_fix
from mdformat.renderer import MDRenderer
from urllib.parse import urlparse

class ImageType(Enum):
    """Closed set of image formats; each member's value is a lowercase format label."""

    # Explicitly named formats.
    JPEG = 'jpeg'
    PNG = 'png'
    GIF = 'gif'
    # Catch-all member for any format not listed above.
    OTHER = 'other'


def get_link_page(prefix:str, linkname:str, sitename: str, name: str) -> str:
    """
    Builds a markdown link to a page, with the target rooted at ``prefix``.

    Args:
        prefix (str): Leading part of the link target, placed before the site name.
        linkname (str): Text shown between the square brackets of the link.
        sitename (str): The name of the site.
        name (str): The name of the page.

    Returns:
        str: Markdown of the form ``[linkname](prefix/sitename/name)``.
    """
    # Only the first 60 characters of the prefix are logged, padded to a fixed width.
    shown_prefix = prefix[:60]
    logger.debug(f"get_link_page: {shown_prefix:<60} {linkname} {sitename}:{name}")

    target = f"{prefix}/{sitename}/{name}"
    return f"[{linkname}]({target})"

def get_link_image(prefix:str, sitename: str, name: str, image_type: ImageType) -> str:
    """
    Builds a markdown image reference, with the target rooted at ``prefix``.

    Args:
        prefix (str): Leading part of the image target, placed before the site name.
        sitename (str): The name of the site.
        name (str): The name of the image.
        image_type (ImageType): The type of the image. Accepted but not used
            when building the link.

    Returns:
        str: Markdown of the form ``![](prefix/sitename/name)`` (empty alt text).
    """
    # Only the first 60 characters of the prefix are logged, padded to a fixed width.
    shown_prefix = prefix[:60]
    logger.debug(f"get_link_image: {shown_prefix:<60} {sitename}:{name}")

    target = f"{prefix}/{sitename}/{name}"
    return f"![]({target})"

def get_include(sitename: str, name: str) -> str:
    """
    Builds the textual include directive for a page of a site.

    Args:
        sitename (str): The name of the site.
        name (str): The name of the page to include.

    Returns:
        str: Text of the form ``include: sitename/name``.
    """
    logger.debug(f"get_include: {sitename}:{name}")

    location = f"{sitename}/{name}"
    return f"include: {location}"

def replace(prefix:str, markdown: str) -> str:
    """
    Rewrites the targets of all image links and page links in the markdown
    so that they are rooted at ``prefix``.

    Only the path component of each original target is kept (as returned by
    ``urlparse``); it is appended to ``prefix`` with exactly one ``/`` between
    the two. Every other URL component (scheme, host, query, fragment) is
    discarded.

    Custom ``!!include`` directives are currently left untouched: the
    substitution step is not enabled (see the TODO near the end of the
    function).

    Args:
        prefix (str): Leading part for every rewritten target. Trailing
            slashes are ignored.
        markdown (str): The markdown content.

    Returns:
        str: The markdown re-rendered by mdformat's ``MDRenderer`` with the
        updated link targets.
    """
    # Parse once; the syntax tree is a view over `tokens`.
    md = MarkdownIt()
    tokens = md.parse(markdown)
    ast = SyntaxTreeNode(tokens)

    # Diagnostic dump of the tree goes to the debug log rather than stdout.
    logger.debug(ast.pretty(indent=2, show_text=True))

    base = prefix.rstrip('/')

    def get_new_url(url: str) -> str:
        """Returns `url`'s path component re-rooted under the prefix."""
        logger.debug(f"url: {url}")

        parsed_url = urlparse(url)
        logger.debug(f"parsed_url: {parsed_url}")

        new_url = f"{base}/{parsed_url.path.strip('/')}"
        logger.debug(f"new_url: {new_url}")

        return new_url

    def process_node(node: SyntaxTreeNode) -> None:
        """Rewrites image/link targets on `node` and all of its descendants."""
        # The renderer below is fed `tokens`, not the tree, so this relies on
        # `node.attrs` being the attribute dict of the underlying token.
        if node.type == 'image':
            node.attrs['src'] = get_new_url(node.attrs.get('src', ''))
        elif node.type == 'link':
            node.attrs['href'] = get_new_url(node.attrs.get('href', ''))

        # Recursively process child nodes
        for child in node.children or []:
            process_node(child)

    def replace_include_directives(match: re.Match) -> str:
        """
        Builds the replacement text for one matched include directive.

        Args:
            match (re.Match): Match whose group 1 is either ``page`` or
                ``site:page``.

        Returns:
            str: The include directive produced by ``get_include``.
        """
        url = match.group(1)
        if ':' in url:
            site_name, page = url.split(':', 1)
            # Only the last path segment is used as the page name.
            page_name = page.split('/')[-1]
        else:
            site_name = ""
            page_name = url
        # The suffix must go on the value that is actually returned.
        if not page_name.endswith('.md'):
            page_name += '.md'
        return get_include(site_name, page_name)

    # Process the root node
    process_node(ast)

    # Convert the (mutated) tokens back to markdown
    renderer = MDRenderer()
    options = {}
    env = {}
    rendered_markdown = renderer.render(tokens, options, env)

    # TODO: include-directive substitution is not enabled yet. To enable it:
    #   include_pattern = re.compile(r"!!include page:'(.*?)'")
    #   rendered_markdown = include_pattern.sub(replace_include_directives, rendered_markdown)

    return rendered_markdown


if __name__ == "__main__":
    # Manual smoke run: print a sample document, then print the same document
    # after its links have been rewritten under a local prefix.
    sample_markdown = """
![Image description](https://example.com/image.png)
[Page link](sitename:some/path/to/page.md)
!!include page:'mypage'
!!include page:'mypage.md'
!!include page:'mysite:mypage
!!include page:'mysite:mypage'
!!include page:'mysite:mypage.md'
    """

    print(sample_markdown)

    rewritten_markdown = replace("http://localhost:8080/pre/", sample_markdown)
    print(rewritten_markdown)