...

2025-08-20 04:01:35 +02:00
parent 749c89aefc
commit 6b9f0cf291
32 changed files with 327 additions and 511 deletions
--- a/README.md
+++ b/README.md
@@ -1,2 +1,4 @@
 # herolib_python

+See also ~/code/git.ourworld.tf/tfgrid_research/tfdev,
+which has some useful stuff as well.
--- a/examples/scrapper/yew_docs.sh
+++ b/examples/scrapper/yew_docs.sh
@@ -1,86 +0,0 @@
-#!/bin/bash
-
-set -ex
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-cd "$SCRIPT_DIR"
-
-source ../../env.sh
-
-cd "$SCRIPT_DIR"
-
-# 1. Install dependencies
-uv pip install --upgrade scrapy markdownify
-#!/bin/bash
-
-set -e
-
-# Ensure clean environment
-rm -rf yew_docs output
-
-# 1. Install required packages
-uv pip install --upgrade scrapy markdownify
-
-# 2. Create Scrapy project
-scrapy startproject yew_docs
-cd yew_docs
-
-# 3. Update settings to ignore robots.txt and set export directory
-echo -e "\nROBOTSTXT_OBEY = False" >> yew_docs/settings.py
-
-# 4. Create Spider with filters
-cat << 'EOF' > yew_docs/spiders/yew_docs_spider.py
-import os
-import scrapy
-from urllib.parse import urlparse, urljoin
-import markdownify
-
-class YewDocsSpider(scrapy.Spider):
-    name = "yew_docs"
-    allowed_domains = ["yew.rs"]
-    start_urls = ["https://yew.rs/docs/getting-started/introduction"]
-
-    def parse(self, response):
-        # Extract title
-        title = response.css("title::text").get() or "Page"
-
-        # Extract main content
-        main = response.css("main").get()
-        if not main:
-            self.logger.warning(f"No main content at {response.url}")
-            return
-
-        # Convert to Markdown
-        md = markdownify.markdownify(main, heading_style="ATX")
-
-        # Construct clean file path
-        parsed = urlparse(response.url)
-        path = parsed.path.lstrip("/")
-        if path.endswith("/") or path == "":
-            path += "index"
-        filepath = os.path.join("output", f"{path}.md")
-
-        os.makedirs(os.path.dirname(filepath), exist_ok=True)
-        with open(filepath, "w", encoding="utf-8") as f:
-            f.write(f"# {title.strip()}\n\n{md}")
-
-        # Follow only clean internal links under /docs/
-        for href in response.css("a::attr(href)").getall():
-            link = urljoin(response.url, href)
-            parsed = urlparse(link)
-            path = parsed.path
-
-            if parsed.netloc == "yew.rs" and path.startswith("/docs/"):
-                if (
-                    "/docs/0." in path or
-                    "/docs/next" in path or
-                    "/docs/en" in path or
-                    "#" in parsed.fragment or
-                    path.count("/") > 5
-                ):
-                    continue
-                yield scrapy.Request(link.split("#")[0], callback=self.parse)
-EOF
-
-# 5. Run the spider
-scrapy crawl yew_docs
--- a/herolib.egg-info/PKG-INFO
+++ b/herolib.egg-info/PKG-INFO
@@ -0,0 +1,12 @@
+Metadata-Version: 2.4
+Name: herolib
+Version: 0.1.0
+Summary: A Python library for HeroCode
+Author-email: Kilo Code <kilo.code@example.com>
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+
+# herolib_python
+
+See also ~/code/git.ourworld.tf/tfgrid_research/tfdev,
+which has some useful stuff as well.
--- a/herolib.egg-info/SOURCES.txt
+++ b/herolib.egg-info/SOURCES.txt
@@ -0,0 +1,78 @@
+README.md
+pyproject.toml
+herolib.egg-info/PKG-INFO
+herolib.egg-info/SOURCES.txt
+herolib.egg-info/dependency_links.txt
+herolib.egg-info/top_level.txt
+lib/__init__.py
+lib/clients/__init__.py
+lib/clients/assemblyai/__init__.py
+lib/clients/assemblyai/client.py
+lib/clients/stellar/__init__.py
+lib/clients/stellar/horizon.py
+lib/clients/stellar/model.py
+lib/clients/stellar/testnet.py
+lib/clients/telegram/__init__.py
+lib/clients/telegram/bot.py
+lib/clients/telegram/bot_audio.py
+lib/clients/telegram/bot_text.py
+lib/clients/telegram/errorqueue.py
+lib/clients/vimeo/__init__.py
+lib/clients/vimeo/client.py
+lib/clients/vimeo/model_video.py
+lib/clients/whisper/__init__.py
+lib/clients/whisper/convert.py
+lib/clients/whisper/whisper.py
+lib/clients/wireless/__init__.py
+lib/clients/wireless/wigle_net.py
+lib/core/__init__.py
+lib/core/heroscript/__init__.py
+lib/core/heroscript/heroaction.py
+lib/core/heroscript/heroscripts.py
+lib/core/heroscript/mixin.py
+lib/core/heroscript/tools.py
+lib/core/heroscript/examples/__init__.py
+lib/core/heroscript/examples/heroscript_example.py
+lib/core/heroscript/examples/heroscript_example2.py
+lib/core/heroscript/examples/wiki/__init__.py
+lib/core/heroscript/examples/wiki/sub/__init__.py
+lib/core/logger/__init__.py
+lib/core/logger/factory.py
+lib/core/logger/log.py
+lib/core/logger/log_test.py
+lib/core/logger/model.py
+lib/core/logger/search.py
+lib/core/loghandler/__init__.py
+lib/core/loghandler/mylogging.py
+lib/core/pathlib/__init__.py
+lib/core/pathlib/pathlib.py
+lib/core/texttools/__init__.py
+lib/core/texttools/texttools.py
+lib/crypt/__init__.py
+lib/crypt/box/__init__.py
+lib/crypt/box/box.py
+lib/crypt/box/box_api.py
+lib/data/__init__.py
+lib/data/ourtime/__init__.py
+lib/data/ourtime/ourtime.py
+lib/downloader/__init__.py
+lib/downloader/scrape_dynamic/dynamic_crawl.py
+lib/downloader/scrape_scapegraph/main.py
+lib/downloader/scrape_scapegraph/scrape.py
+lib/downloader/scrape_scapegraph/scrape_md.py
+lib/downloader/scrape_scapegraph/scrape_search.py
+lib/downloader/scrape_scapegraph/scrape_with_local_llm.py
+lib/downloader/scrape_scapegraph/scrape_with_local_llm_search.py
+lib/tools/__init__.py
+lib/tools/extensions.py
+lib/tools/gitscanner.py
+lib/tools/logger.py
+lib/tools/md5.py
+lib/tools/ourtime.py
+lib/tools/pathtools.py
+lib/tools/texttools.py
+lib/web/__init__.py
+lib/web/doctools/__init__.py
+lib/web/doctools/html_replacer.py
+lib/web/doctools/md_replacer.py
+lib/web/doctools/processor.py
--- a/herolib.egg-info/dependency_links.txt
+++ b/herolib.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
--- a/herolib.egg-info/top_level.txt
+++ b/herolib.egg-info/top_level.txt
@@ -0,0 +1 @@
+lib
--- a/lib/init.py
+++ b/lib/init.py
--- a/lib/clients/init.py
+++ b/lib/clients/init.py
--- a/lib/clients/assemblyai/init.py
+++ b/lib/clients/assemblyai/init.py
--- a/lib/clients/vimeo/init.py
+++ b/lib/clients/vimeo/init.py
--- a/lib/clients/wireless/init.py
+++ b/lib/clients/wireless/init.py
--- a/lib/core/init.py
+++ b/lib/core/init.py
--- a/lib/core/heroscript/init.py
+++ b/lib/core/heroscript/init.py
--- a/lib/core/heroscript/examples/init.py
+++ b/lib/core/heroscript/examples/init.py
--- a/lib/core/heroscript/examples/wiki/init.py
+++ b/lib/core/heroscript/examples/wiki/init.py
--- a/lib/core/heroscript/examples/wiki/sub/init.py
+++ b/lib/core/heroscript/examples/wiki/sub/init.py
--- a/lib/core/logger/pycache/init.py
+++ b/lib/core/logger/pycache/init.py
--- a/lib/core/loghandler/init.py
+++ b/lib/core/loghandler/init.py
--- a/lib/core/loghandler/mylogging.py
+++ b/lib/core/loghandler/mylogging.py
@@ -0,0 +1,214 @@
+from peewee import *
+import time
+from datetime import datetime
+from typing import Optional, List, Dict, Any, Iterable, Union
+import os
+import logging
+import traceback
+
+# Database location: a directory under the current user's home folder, with
+# a single SQLite file inside it.
+DB_DIR = os.path.expanduser('~/hero/var/logdb/')
+DB_FILE = os.path.join(DB_DIR, 'logs.db')
+
+# Create the directory at import time (no error if it already exists) so the
+# database file below has somewhere to live.
+os.makedirs(DB_DIR, exist_ok=True)
+
+# Module-wide Peewee database handle shared by every model in this file.
+# The journal_mode pragma asks SQLite for write-ahead logging.
+database = SqliteDatabase(DB_FILE, pragmas={'journal_mode': 'wal'})
+
+class BaseModel(Model):
+    """Common Peewee base that binds every subclass to the shared database."""
+    class Meta:
+        database = database
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return the row as a plain dict keyed by field name.
+
+        Integer values of ``time`` and ``last_seen`` are treated as epoch
+        seconds and rendered as ``DD-MM HH:MM`` strings for the frontend;
+        every other value is passed through untouched.
+        """
+        epoch_fields = ('time', 'last_seen')
+        result = {}
+        for name in self._meta.fields:
+            value = getattr(self, name)
+            is_epoch = name in epoch_fields and isinstance(value, int)
+            result[name] = (
+                datetime.fromtimestamp(value).strftime('%d-%m %H:%M')
+                if is_epoch
+                else value
+            )
+        return result
+
+class Log(BaseModel):
+    """Model for INFO logs.
+
+    Rows live in the ``logs`` table and are written by ``log_info``.
+    """
+    # Creation time as integer epoch seconds. The lambda resolves ``time`` to
+    # the imported module, because class-body names (such as this field) are
+    # not visible from inside nested functions.
+    time = IntegerField(default=lambda: int(time.time()), index=True)
+    # Optional e-mail address associated with the entry.
+    email = CharField(max_length=255, null=True)
+    # The log message itself.
+    logmsg = TextField()
+    # Numeric level. NOTE(review): log_info always passes a level (its own
+    # default is 0), so this default of 100 only applies to rows created by
+    # some other path - confirm which default is intended.
+    level = IntegerField(default=100)
+    # Category; the query helpers in this file match it by prefix.
+    cat = CharField(max_length=100, index=True, default="general")
+    # Optional free-form payload and a category describing it.
+    payload = TextField(null=True)
+    payload_cat = CharField(max_length=100, null=True)
+
+    class Meta:
+        table_name = 'logs'
+
+class Error(BaseModel):
+    """Model for ERROR logs.
+
+    Rows live in the ``errors`` table and are written by ``log_error``, which
+    folds repeated occurrences of the same message into a single row.
+    """
+    # Creation time as integer epoch seconds; set when the row is created and
+    # not modified afterwards by the code in this file.
+    time = IntegerField(default=lambda: int(time.time()), index=True)
+    # Epoch seconds of the most recent occurrence; log_error refreshes it on
+    # every duplicate.
+    last_seen = IntegerField(default=lambda: int(time.time()), index=True)
+    # Optional e-mail address; part of the duplicate-detection key.
+    email = CharField(max_length=255, null=True)
+    # The error message; part of the duplicate-detection key.
+    logmsg = TextField()
+    # Stack trace text; replaced with the latest one on every duplicate.
+    stacktrace = TextField(null=True)
+    # Number of occurrences folded into this row.
+    count = IntegerField(default=1)
+    # Category; the query helpers in this file match it by prefix.
+    cat = CharField(max_length=100, index=True, default="general")
+    # Optional free-form payload and a category describing it.
+    payload = TextField(null=True)
+    payload_cat = CharField(max_length=100, null=True)
+
+    class Meta:
+        table_name = 'errors'
+
+def init_db_logging():
+    """Create the ``logs`` and ``errors`` tables if they don't exist.
+
+    Call this once before logging anything; the other helpers in this file
+    assume the tables are already present.
+    """
+    # The database object is used as a context manager for the duration of
+    # the call (presumably managing the connection - see the Peewee docs).
+    with database:
+        # safe=True: skip tables that already exist instead of failing.
+        database.create_tables([Log, Error], safe=True)
+
+class DatabaseLogHandler(logging.Handler):
+    """A logging handler that writes records to the Peewee database.
+
+    Records at ERROR level or above are passed to ``log_error``; everything
+    below is passed to ``log_info``. The logger name is used as the category.
+    """
+    def emit(self, record):
+        """Persist one ``logging.LogRecord``.
+
+        Never raises: any failure (for example a message whose arguments do
+        not match its format string) is routed to ``Handler.handleError``,
+        which is what the logging module expects of handlers.
+        """
+        try:
+            stacktrace = None
+            if record.exc_info:
+                stacktrace = logging.Formatter().formatException(record.exc_info)
+
+            message = record.getMessage()
+            if record.levelno >= logging.ERROR:
+                log_error(
+                    msg=message,
+                    cat=record.name,
+                    stacktrace=stacktrace
+                )
+            else:
+                log_info(
+                    msg=message,
+                    level=record.levelno,
+                    cat=record.name
+                )
+        except Exception:
+            # A broken record must not take down the code that logged it.
+            self.handleError(record)
+
+def log_error(msg: str, cat: str = "general", email: Optional[str] = None, stacktrace: Optional[str] = None, payload: Optional[str] = None, payload_cat: Optional[str] = None):
+    """Log an ERROR message to the database, collapsing duplicates.
+
+    The message is first mirrored into the ``logs`` table via ``log_info``.
+    It is then recorded in ``errors``: if a row with the same message and
+    email was seen within the last 24 hours, its counter, ``last_seen`` and
+    stack trace are updated; otherwise a new row is created.
+
+    Args:
+        msg: The error message; also used to detect duplicates.
+        cat: Category stored on a newly created row.
+        email: Optional e-mail address; also used to detect duplicates.
+        stacktrace: Pre-formatted stack trace. When omitted or empty, the
+            current call stack is captured instead.
+        payload: Optional extra data stored on a newly created row.
+        payload_cat: Optional category describing ``payload``.
+
+    Never raises; a failure to write is reported on stdout.
+    """
+    # log_info handles its own failures, so no extra guard is needed here.
+    log_info(msg=msg, cat=cat, email=email, payload=payload, payload_cat=payload_cat)
+    try:
+        if not stacktrace:
+            # Capture the current stack trace if not provided
+            stacktrace = "".join(traceback.format_stack())
+
+        # Drop lines that point into the logging machinery or this module.
+        # NOTE(review): both substrings are hard-coded (a Python 3.13 install
+        # path and a src/ layout) - confirm they match the deployed paths.
+        stacktrace = '\n'.join(
+            line for line in stacktrace.split('\n')
+            if 'python3.13/logging' not in line and 'src/mylogging.py' not in line
+        )
+
+        # Sample the clock once so the lookup window and the stored
+        # last_seen value are based on the same instant.
+        now = int(time.time())
+        one_day_ago = now - (24 * 3600)
+
+        # Look for a similar error in the last 24 hours from the same user
+        existing_error = Error.select().where(
+            (Error.logmsg == msg) &
+            (Error.email == email) &
+            (Error.last_seen >= one_day_ago)
+        ).first()
+
+        if existing_error:
+            # If found, increment counter and update last_seen
+            existing_error.count += 1
+            existing_error.last_seen = now
+            existing_error.stacktrace = stacktrace
+            existing_error.save()
+        else:
+            # Otherwise, create a new error record
+            Error.create(
+                logmsg=msg,
+                cat=cat,
+                email=email,
+                stacktrace=stacktrace,
+                payload=payload,
+                payload_cat=payload_cat
+            )
+            logging.info(f"Successfully logged new error: {msg}")
+
+    except Exception as e:
+        # Report with print, like log_info does. Going through logging.error
+        # here would re-enter DatabaseLogHandler when it is attached to the
+        # root logger and recurse for as long as the database keeps failing.
+        print(f"Failed to log error to {DB_FILE}: {e}")
+
+def log_info(msg: str, level: int = 0, cat: str = "general", email: Optional[str] = None, payload: Optional[str] = None, payload_cat: Optional[str] = None):
+    """Insert one row into the ``logs`` table.
+
+    Never raises: any failure is printed to stdout instead of propagating.
+    """
+    row = {
+        'logmsg': msg,
+        'level': level,
+        'cat': cat,
+        'email': email,
+        'payload': payload,
+        'payload_cat': payload_cat,
+    }
+    try:
+        Log.create(**row)
+    except Exception as exc:
+        print(f"Failed to log info to {DB_FILE}: {exc}")
+
+def get_errors(search: Optional[str] = None, cat: Optional[str] = None) -> List[Dict[str, Any]]:
+    """Return error rows as dicts, most recently seen first.
+
+    ``search`` keeps rows whose message contains it. ``cat`` is stripped and,
+    if anything is left, keeps rows whose category starts with it.
+    """
+    category = cat.strip() if cat else ""
+
+    conditions = []
+    if search:
+        conditions.append(Error.logmsg.contains(search))
+    if category:
+        conditions.append(Error.cat.startswith(category))
+
+    errors = Error.select().order_by(Error.last_seen.desc())
+    for condition in conditions:
+        errors = errors.where(condition)
+    return [row.to_dict() for row in errors]
+
+def get_logs(
+    search: Optional[str] = None,
+    cat: Optional[str] = None,
+    level: Optional[int] = None,
+    hours_ago: Optional[int] = None,
+) -> List[Dict[str, Any]]:
+    """Return log rows as dicts, newest first, narrowed by optional filters.
+
+    ``search`` (ignored when blank) keeps rows whose message contains it.
+    ``cat`` is stripped and matched as a category prefix. ``level`` keeps rows
+    whose level is at most that value. ``hours_ago`` keeps rows created
+    within that many hours of now.
+    """
+    category = cat.strip() if cat else ""
+
+    conditions = []
+    if search and search.strip():
+        # The unstripped text is what gets matched.
+        conditions.append(Log.logmsg.contains(search))
+    if category:
+        conditions.append(Log.cat.startswith(category))
+    if level is not None:
+        conditions.append(Log.level <= level)
+    if hours_ago is not None:
+        cutoff = int(time.time()) - (hours_ago * 3600)
+        conditions.append(Log.time >= cutoff)
+
+    logs = Log.select().order_by(Log.time.desc())
+    for condition in conditions:
+        logs = logs.where(condition)
+    return [row.to_dict() for row in logs]
+
+def get_log_by_id(log_id: int) -> Optional[Dict[str, Any]]:
+    """Return the log row with the given ID as a dict, or None if absent."""
+    try:
+        row = Log.get_by_id(log_id)
+    except Log.DoesNotExist:
+        return None
+    return row.to_dict()
+
+def delete_logs_older_than(minutes: int):
+    """Remove every log row created more than ``minutes`` minutes ago."""
+    cutoff = int(time.time()) - (minutes * 60)
+    is_stale = Log.time < cutoff
+    Log.delete().where(is_stale).execute()
+
+def delete_errors_older_than(minutes: int):
+    """Remove every error row created more than ``minutes`` minutes ago.
+
+    NOTE(review): the cutoff is applied to ``time`` (row creation), not
+    ``last_seen``, so a row that is still being updated by recurring errors
+    is removed as well - confirm this is intended.
+    """
+    cutoff = int(time.time()) - (minutes * 60)
+    is_stale = Error.time < cutoff
+    Error.delete().where(is_stale).execute()
+
+def get_unique_log_categories() -> List[str]:
+    """Return the distinct, non-null log categories in sorted order."""
+    rows = Log.select(Log.cat).where(Log.cat.is_null(False)).distinct().order_by(Log.cat)
+    categories = []
+    for row in rows:
+        categories.append(row.cat)
+    return categories
+
+def get_unique_error_categories() -> List[str]:
+    """Return the distinct, non-null error categories in sorted order."""
+    rows = Error.select(Error.cat).where(Error.cat.is_null(False)).distinct().order_by(Error.cat)
+    categories = []
+    for row in rows:
+        categories.append(row.cat)
+    return categories
--- a/lib/core/pathlib/pycache/init.py
+++ b/lib/core/pathlib/pycache/init.py
--- a/lib/crypt/init.py
+++ b/lib/crypt/init.py
--- a/lib/crypt/box/init.py
+++ b/lib/crypt/box/init.py
--- a/lib/data/init.py
+++ b/lib/data/init.py
--- a/lib/data/ourtime/pycache/init.py
+++ b/lib/data/ourtime/pycache/init.py
--- a/lib/downloader/init.py
+++ b/lib/downloader/init.py
--- a/lib/downloader/downloader.py
+++ b/lib/downloader/downloader.py
@@ -1,412 +0,0 @@
-import json
-import logging
-import mimetypes  # Added
-import os
-from datetime import datetime, timedelta
-from urllib.parse import urljoin, urlparse
-
-import scrapy
-from scrapy.crawler import CrawlerProcess
-from scrapy.linkextractors import LinkExtractor
-from scrapy.utils.project import get_project_settings
-
-# Configure logging
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
-)
-logger = logging.getLogger(__name__)
-
-STATE_FILE_NAME = ".download_state.json"
-
-
-class GenericDownloaderSpider(scrapy.Spider):
-    name = "generic_downloader"
-
-    def __init__(
-        self,
-        start_url,
-        dest_dir,
-        allowed_domains,
-        ignore_paths=None,
-        depth_limit=0,
-        follow_links=True,
-        max_age_hours=0,
-        state_data=None,
-        *args,
-        **kwargs,
-    ):
-        super(GenericDownloaderSpider, self).__init__(*args, **kwargs)
-        self.start_urls = [start_url]
-        self.dest_dir = dest_dir
-        self.allowed_domains = allowed_domains
-        self.ignore_paths = ignore_paths if ignore_paths else []
-        self.depth_limit = int(depth_limit)
-        self.follow_links = bool(follow_links)
-        self.max_age_hours = int(max_age_hours)
-        self.state_data = state_data if state_data else {}
-        self.link_extractor = LinkExtractor(allow_domains=self.allowed_domains)
-
-        os.makedirs(self.dest_dir, exist_ok=True)
-        logger.info(f"Downloader initialized for {start_url}")
-        logger.info(f"Destination directory: {self.dest_dir}")
-        logger.info(f"Allowed domains: {self.allowed_domains}")
-        logger.info(f"Ignore paths: {self.ignore_paths}")
-        logger.info(f"Depth limit: {self.depth_limit}")
-        logger.info(f"Follow links: {self.follow_links}")
-        logger.info(f"Max age (hours): {self.max_age_hours}")
-
-    def _should_ignore(self, url_path):
-        for pattern in self.ignore_paths:
-            if pattern in url_path:  # Simple substring match for now, can be regex
-                return True
-        return False
-
-    def _get_file_path(self, response):  # Changed signature to take response
-        url = response.url
-        parsed_url = urlparse(url)
-        original_path = parsed_url.path  # e.g. /foo/bar.html or /foo/ or /
-
-        # Determine base_name and current_ext from original_path
-        if original_path.endswith("/"):
-            base_name = "index"
-            current_ext = ""
-            # path_for_dirs is the path part that forms the directory structure
-            path_for_dirs = original_path.lstrip("/")
-        else:
-            path_basename = os.path.basename(original_path)
-            if (
-                not path_basename and original_path == "/"
-            ):  # Root path e.g. http://example.com
-                base_name = "index"
-                current_ext = ""
-            else:  # e.g. /file.txt or /file_no_ext or /.config
-                base_name, current_ext = os.path.splitext(path_basename)
-                if not base_name and current_ext:  # Hidden file like /.bashrc
-                    base_name = current_ext  # Treat .bashrc as base_name
-                    current_ext = ""  # No further extension part
-            path_for_dirs = os.path.dirname(original_path.lstrip("/"))
-
-        # Try to get extension from Content-Type
-        content_type = (
-            response.headers.get("Content-Type", b"")
-            .decode("utf-8")
-            .split(";")[0]
-            .strip()
-        )
-        mime_ext = mimetypes.guess_extension(content_type) if content_type else None
-
-        final_ext = current_ext
-        if mime_ext and not current_ext:  # No path extension, use MIME type's
-            final_ext = mime_ext
-        elif (
-            mime_ext
-            and current_ext.lower() in [".htm", ".html"]
-            and mime_ext
-            and mime_ext.lower() not in [".htm", ".html"]
-        ):
-            # Path had .html/.htm, but MIME type suggests something more specific
-            final_ext = mime_ext
-            logger.debug(
-                f"URL {url}: Path ext {current_ext} overridden by Content-Type ext {mime_ext}."
-            )
-        elif not final_ext and (
-            content_type.startswith("text/")
-            or content_type
-            in ["application/javascript", "application/json", "application/xml"]
-        ):
-            # Fallback for common text types if no extension determined yet and no path ext
-            if not base_name.endswith(
-                (".js", ".css", ".json", ".xml", ".txt")
-            ):  # Avoid double .html.html
-                final_ext = ".html"
-
-        filename = base_name + final_ext
-
-        # Create path components for the directory structure
-        components = []
-        if path_for_dirs:
-            components.extend(comp for comp in path_for_dirs.split("/") if comp)
-        components.append(filename)
-
-        # Sanitize components
-        sane_components = []
-        for comp_idx, comp_val in enumerate(components):
-            # Basic sanitization: replace invalid chars, limit length, avoid '..'
-            # Allow '.' for filenames but not as a full component name if it's not the only char
-            if comp_val == "..":
-                continue  # Skip parent dir references in path construction
-
-            sane_comp = "".join(
-                c if c.isalnum() or c in ["-", "_", "."] else "_" for c in comp_val
-            )
-            sane_comp = sane_comp[:150]  # Limit component length
-
-            if (
-                not sane_comp and comp_idx == len(components) - 1
-            ):  # last component (filename) became empty
-                sane_comp = "downloaded_file" + final_ext  # fallback filename
-            elif not sane_comp:
-                sane_comp = "_"  # placeholder for empty dir name
-
-            if sane_comp:  # Ensure component is not empty after sanitization
-                sane_components.append(sane_comp)
-
-        if not sane_components:  # If all components were sanitized away or skipped
-            sane_components = [filename if filename else "unknown_file" + final_ext]
-
-        file_path = os.path.join(self.dest_dir, *sane_components)
-        return file_path
-
-    def parse(self, response, depth=0):
-        url = response.url
-        logger.info(f"Processing URL (depth {depth}): {url}")
-
-        parsed_url = urlparse(url)
-        if self._should_ignore(parsed_url.path):
-            logger.info(f"Ignoring URL (matches ignore_paths): {url}")
-            return
-
-        file_path = self._get_file_path(response)  # Pass response object
-
-        # Check download state and max_age
-        if url in self.state_data:
-            url_state = self.state_data[url]
-            last_download_time_str = url_state.get("timestamp")
-            # Consider previous status; only skip if it was a success or another skip
-            can_skip_based_on_history = url_state.get("status", "").startswith(
-                "success"
-            ) or url_state.get("status", "").startswith("skipped")
-
-            if last_download_time_str and can_skip_based_on_history:
-                last_download_time = datetime.fromisoformat(last_download_time_str)
-                if self.max_age_hours > 0 and (
-                    datetime.utcnow() - last_download_time
-                ) < timedelta(hours=self.max_age_hours):
-                    logger.info(
-                        f"Skipping download for {url}, recently processed at {last_download_time_str} with status '{url_state.get('status')}'."
-                    )
-                    # Update state to reflect this skip check
-                    self.state_data[url]["status"] = "skipped_max_age"
-                    self.state_data[url]["skipped_timestamp"] = (
-                        datetime.utcnow().isoformat()
-                    )
-                    # Still need to check for links if recursive
-                    # Corrected depth condition:
-                    # Follow if self.depth_limit is 0 (infinite) OR current depth is less than a positive limit.
-                    if self.follow_links and (
-                        self.depth_limit == 0 or depth < self.depth_limit
-                    ):
-                        for link in self.link_extractor.extract_links(response):
-                            parsed_link_url = urlparse(link.url)
-                            if not self._should_ignore(parsed_link_url.path):
-                                yield response.follow(link, callback=self.parse)
-                            else:
-                                logger.info(
-                                    f"Ignoring extracted link (matches ignore_paths): {link.url}"
-                                )
-                    return
-
-        logger.info(f"Processing and saving {url} to {file_path}")
-        os.makedirs(os.path.dirname(file_path), exist_ok=True)
-
-        try:
-            with open(file_path, "wb") as f:
-                f.write(response.body)
-            logger.info(f"Successfully saved {url} to {file_path}")
-            self.state_data[url] = {
-                "timestamp": datetime.utcnow().isoformat(),
-                "status": "success",
-                "path": file_path,
-                "size": len(response.body),
-            }
-        except Exception as e:
-            logger.error(f"Failed to save {url} to {file_path}: {e}")
-            self.state_data[url] = {
-                "timestamp": datetime.utcnow().isoformat(),
-                "status": "failed",
-                "error": str(e),
-            }
-            return  # Do not proceed further if save failed
-
-        # Corrected depth condition for following links:
-        # Follow if self.depth_limit is 0 (infinite) OR current depth is less than a positive limit.
-        if self.follow_links and (self.depth_limit == 0 or depth < self.depth_limit):
-            logger.info(
-                f"Following links from {url} at custom depth {depth} (for next level {depth + 1})"
-            )
-            extracted_links = list(self.link_extractor.extract_links(response))
-            if not extracted_links:
-                logger.info(f"  No links extracted from {url} by LinkExtractor.")
-            else:
-                logger.info(
-                    f"  LinkExtractor found {len(extracted_links)} links from {url}: {[l.url for l in extracted_links]}"
-                )
-
-            for link_idx, link in enumerate(extracted_links):
-                logger.debug(
-                    f"  Considering link {link_idx + 1}/{len(extracted_links)}: Text='{link.text}', URL='{link.url}'"
-                )
-                parsed_link_url = urlparse(link.url)
-                if self._should_ignore(parsed_link_url.path):
-                    logger.info(
-                        f"  Ignoring extracted link (matches ignore_paths): {link.url}"
-                    )
-                else:
-                    logger.info(
-                        f"  Yielding request for: {link.url} (to be processed at custom depth {depth + 1})"
-                    )
-                    yield response.follow(link, callback=self.parse)
-
-    def closed(self, reason):
-        logger.info(f"Spider closed: {reason}. Finalizing and saving state.")
-        state_file_path = os.path.join(self.dest_dir, STATE_FILE_NAME)
-        try:
-            # Ensure the directory for the state file exists, though dest_dir should already.
-            os.makedirs(os.path.dirname(state_file_path), exist_ok=True)
-            with open(state_file_path, "w") as f:
-                json.dump(self.state_data, f, indent=4)
-            logger.info(
-                f"Spider successfully saved state ({len(self.state_data)} items) to {state_file_path}"
-            )
-        except Exception as e:
-            logger.error(
-                f"Spider failed to save state to {state_file_path}: {e}", exc_info=True
-            )
-
-
-def download_site(
-    start_url,
-    dest_dir,
-    recursive=True,
-    ignore_paths=None,
-    depth_limit=0,  # 0 means no limit if recursive is True
-    follow_links=True,  # This is somewhat redundant if recursive is True, but good for clarity
-    max_age_hours=24,  # Re-download if older than 24 hours
-):
-    """
-    Downloads a website or a single page.
-
-    :param start_url: The URL to start downloading from.
-    :param dest_dir: The directory to save downloaded files.
-    :param recursive: Whether to download recursively.
-    :param ignore_paths: A list of path substrings or regex patterns to ignore.
-    :param depth_limit: Maximum depth for recursive downloads (0 for no limit).
-    :param follow_links: Whether to follow links on pages (primarily for recursive).
-    :param max_age_hours: Max age of a file in hours. If a file was downloaded
-                          more recently than this, it won't be re-downloaded.
-                          0 means always re-download.
-    :return: A dictionary summarizing the download process.
-    """
-    parsed_url = urlparse(start_url)
-    if not parsed_url.scheme or not parsed_url.netloc:
-        logger.error(
-            f"Invalid start_url: {start_url}. Must be a full URL (e.g., http://example.com)"
-        )
-        return None
-
-    allowed_domains = [parsed_url.hostname]  # Changed from netloc to hostname
-
-    state_file_path = os.path.join(dest_dir, STATE_FILE_NAME)
-    state_data = {}
-    if os.path.exists(state_file_path):
-        try:
-            with open(state_file_path, "r") as f:
-                state_data = json.load(f)
-            logger.info(f"Loaded download state from {state_file_path}")
-        except json.JSONDecodeError:
-            logger.warning(
-                f"Could not decode JSON from state file {state_file_path}. Starting fresh."
-            )
-        except Exception as e:
-            logger.error(
-                f"Error loading state file {state_file_path}: {e}. Starting fresh."
-            )
-
-    settings = get_project_settings()
-    settings.set("ROBOTSTXT_OBEY", False)  # Explicitly disable robots.txt
-    # settings.set('LOG_LEVEL', 'DEBUG') # Optionally enable for more Scrapy internal logs
-
-    effective_scrapy_depth = 0  # Default for non-recursive or depth_limit=0 with recursion (0 means infinite for Scrapy)
-    if recursive and int(depth_limit) > 0:
-        effective_scrapy_depth = int(depth_limit)
-    # If not recursive, effective_scrapy_depth remains 0.
-    # If recursive and depth_limit is 0, effective_scrapy_depth remains 0 (infinite).
-    settings.set("DEPTH_LIMIT", effective_scrapy_depth)
-
-    logger.info(f"Scrapy DEPTH_LIMIT set to: {effective_scrapy_depth}")
-    # Scrapy's DEPTH_PRIORITY and SCHEDULER_DISK_QUEUE might be useful for large crawls
-    # For now, keeping it simple.
-
-    process = CrawlerProcess(settings)
-
-    # The spider needs to be instantiated with all its custom args
-    # Scrapy's process.crawl can take kwargs which are passed to the spider's __init__
-    process.crawl(
-        GenericDownloaderSpider,
-        start_url=start_url,
-        dest_dir=dest_dir,
-        allowed_domains=allowed_domains,
-        ignore_paths=ignore_paths,
-        depth_limit=int(depth_limit)
-        if recursive
-        else 0,  # Spider handles its own depth based on this
-        follow_links=follow_links and recursive,
-        max_age_hours=int(max_age_hours),
-        state_data=state_data,
-    )
-
-    logger.info(f"Starting download process for {start_url}...")
-    process.start()  # This will block until the crawl is finished
-
-    # The spider's closed() method is now responsible for writing the final state.
-    # Load this definitive state to build the summary.
-    final_state_data_for_summary = {}
-    if os.path.exists(state_file_path):
-        try:
-            with open(state_file_path, "r") as f:
-                final_state_data_for_summary = json.load(f)
-            logger.info(
-                f"Loaded final state ({len(final_state_data_for_summary)} items) from {state_file_path} for summary construction."
-            )
-        except json.JSONDecodeError as e:
-            logger.error(
-                f"Error decoding JSON from final state file {state_file_path} for summary: {e}. Summary will be based on empty or incomplete state."
-            )
-        except Exception as e:
-            logger.error(
-                f"Error loading final state from {state_file_path} for summary: {e}. Summary will be based on empty or incomplete state."
-            )
-    else:
-        logger.warning(
-            f"State file {state_file_path} not found after spider closed. Summary will be based on empty state."
-        )
-
-    summary = {
-        "start_url": start_url,
-        "dest_dir": dest_dir,
-        "total_processed_urls": len(final_state_data_for_summary),
-        "successful_downloads": 0,
-        "failed_downloads": 0,
-        "skipped_max_age": 0,
-        "total_bytes_downloaded": 0,
-        "state_file_path": state_file_path,
-        "errors": [],
-    }
-
-    # Populate summary from the final_state_data_for_summary loaded from the file
-    for url_key, data_val in final_state_data_for_summary.items():
-        status = data_val.get("status")
-        if status == "success":
-            summary["successful_downloads"] += 1
-            summary["total_bytes_downloaded"] += data_val.get("size", 0)
-        elif status == "failed":
-            summary["failed_downloads"] += 1
-            if "error" in data_val:
-                summary["errors"].append(f"URL: {url_key}, Error: {data_val['error']}")
-        elif status == "skipped_max_age":
-            summary["skipped_max_age"] += 1
-        # Any errors during state file loading for summary should also be noted if critical
-        # For now, the logs capture it. If final_state_data_for_summary is empty due to load error, summary will reflect that.
-
-    logger.info(f"Download process finished. Summary: {json.dumps(summary, indent=2)}")
-    return summary
--- a/lib/downloader/scrape_dynamic
+++ b/lib/downloader/scrape_dynamic
@@ -0,0 +1 @@
+../../../../tfgrid_research/tfdev/research/scrape_dynamic
--- a/lib/downloader/scrape_fast
+++ b/lib/downloader/scrape_fast
@@ -0,0 +1 @@
+../../../../tfgrid_research/tfdev/research/scrape_fast
--- a/lib/downloader/scrape_scapegraph
+++ b/lib/downloader/scrape_scapegraph
@@ -0,0 +1 @@
+../../../../tfgrid_research/tfdev/research/scrape_scapegraph
--- a/lib/web/init.py
+++ b/lib/web/init.py
--- a/lib/web/doctools/init.py
+++ b/lib/web/doctools/init.py
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,15 +1,18 @@
 [project]
-name = "Herolib"
-version = "0.9.0"
-description = "Lib from Hero's project for Actors"
-requires-python = ">=3.13"
-dependencies = [
-    "peewee>=3.17.0",
-    "pygments>=2.16.1",
-    "toml",
-    "requests>=2.31.0",
-    "beautifulsoup4>=4.12.2",
-    "pydantic>=2.8.0",
-    "scrapy==2.13.3",
-    "markdownify=1.1.0"
+name = "herolib"
+version = "0.1.0"
+description = "A Python library for HeroCode"
+authors = [
+    { name = "Kilo Code", email = "kilo.code@example.com" }
 ]
+readme = "README.md"
+requires-python = ">=3.8"
+# peewee is imported by lib/core/loghandler/mylogging.py, so it must be
+# declared here for an installed package to be importable.
+dependencies = [
+    "peewee>=3.17.0",
+]
+
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["lib*"]
				`@@ -0,0 +1 @@`
				`../../../../tfgrid_research/tfdev/research/scrape_dynamic`