From 6b9f0cf29175cad48279827ce8d3076bcf852b1b Mon Sep 17 00:00:00 2001 From: despiegk Date: Wed, 20 Aug 2025 04:01:35 +0200 Subject: [PATCH] ... --- README.md | 2 + examples/scrapper/yew_docs.sh | 86 ---- herolib.egg-info/PKG-INFO | 12 + herolib.egg-info/SOURCES.txt | 78 ++++ herolib.egg-info/dependency_links.txt | 1 + herolib.egg-info/top_level.txt | 1 + lib/__init__.py | 0 lib/clients/__init__.py | 0 lib/clients/assemblyai/__init__.py | 0 lib/clients/vimeo/__init__.py | 0 lib/clients/wireless/__init__.py | 0 lib/core/__init__.py | 0 lib/core/heroscript/__init__.py | 0 lib/core/heroscript/examples/__init__.py | 0 lib/core/heroscript/examples/wiki/__init__.py | 0 .../heroscript/examples/wiki/sub/__init__.py | 0 lib/core/logger/__pycache__/__init__.py | 0 lib/core/loghandler/__init__.py | 0 lib/core/loghandler/mylogging.py | 214 +++++++++ lib/core/pathlib/__pycache__/__init__.py | 0 lib/crypt/__init__.py | 0 lib/crypt/box/__init__.py | 0 lib/data/__init__.py | 0 lib/data/ourtime/__pycache__/__init__.py | 0 lib/downloader/__init__.py | 0 lib/downloader/downloader.py | 412 ------------------ lib/downloader/scrape_dynamic | 1 + lib/downloader/scrape_fast | 1 + lib/downloader/scrape_scapegraph | 1 + lib/web/__init__.py | 0 lib/web/doctools/__init__.py | 0 pyproject.toml | 29 +- 32 files changed, 327 insertions(+), 511 deletions(-) delete mode 100755 examples/scrapper/yew_docs.sh create mode 100644 herolib.egg-info/PKG-INFO create mode 100644 herolib.egg-info/SOURCES.txt create mode 100644 herolib.egg-info/dependency_links.txt create mode 100644 herolib.egg-info/top_level.txt create mode 100644 lib/__init__.py create mode 100644 lib/clients/__init__.py create mode 100644 lib/clients/assemblyai/__init__.py create mode 100644 lib/clients/vimeo/__init__.py create mode 100644 lib/clients/wireless/__init__.py create mode 100644 lib/core/__init__.py create mode 100644 lib/core/heroscript/__init__.py create mode 100644 lib/core/heroscript/examples/__init__.py create mode 100644 lib/core/heroscript/examples/wiki/__init__.py create mode 100644 lib/core/heroscript/examples/wiki/sub/__init__.py create mode 100644 lib/core/logger/__pycache__/__init__.py create mode 100644 lib/core/loghandler/__init__.py create mode 100644 lib/core/loghandler/mylogging.py create mode 100644 lib/core/pathlib/__pycache__/__init__.py create mode 100644 lib/crypt/__init__.py create mode 100644 lib/crypt/box/__init__.py create mode 100644 lib/data/__init__.py create mode 100644 lib/data/ourtime/__pycache__/__init__.py create mode 100644 lib/downloader/__init__.py delete mode 100644 lib/downloader/downloader.py create mode 120000 lib/downloader/scrape_dynamic create mode 120000 lib/downloader/scrape_fast create mode 120000 lib/downloader/scrape_scapegraph create mode 100644 lib/web/__init__.py create mode 100644 lib/web/doctools/__init__.py diff --git a/README.md b/README.md index 7c1dc7d..83df8d1 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,4 @@ # herolib_python +see also ~/code/git.ourworld.tf/tfgrid_research/tfdev +has some usefull stuff as well \ No newline at end of file diff --git a/examples/scrapper/yew_docs.sh b/examples/scrapper/yew_docs.sh deleted file mode 100755 index 5775bd0..0000000 --- a/examples/scrapper/yew_docs.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/bin/bash - -set -ex - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -cd "$SCRIPT_DIR" - -source ../../env.sh - -cd "$SCRIPT_DIR" - -# 1. 
Install dependencies -uv pip install --upgrade scrapy markdownify -#!/bin/bash - -set -e - -# Ensure clean environment -rm -rf yew_docs output - -# 1. Install required packages -uv pip install --upgrade scrapy markdownify - -# 2. Create Scrapy project -scrapy startproject yew_docs -cd yew_docs - -# 3. Update settings to ignore robots.txt and set export directory -echo -e "\nROBOTSTXT_OBEY = False" >> yew_docs/settings.py - -# 4. Create Spider with filters -cat << 'EOF' > yew_docs/spiders/yew_docs_spider.py -import os -import scrapy -from urllib.parse import urlparse, urljoin -import markdownify - -class YewDocsSpider(scrapy.Spider): - name = "yew_docs" - allowed_domains = ["yew.rs"] - start_urls = ["https://yew.rs/docs/getting-started/introduction"] - - def parse(self, response): - # Extract title - title = response.css("title::text").get() or "Page" - - # Extract main content - main = response.css("main").get() - if not main: - self.logger.warning(f"No main content at {response.url}") - return - - # Convert to Markdown - md = markdownify.markdownify(main, heading_style="ATX") - - # Construct clean file path - parsed = urlparse(response.url) - path = parsed.path.lstrip("/") - if path.endswith("/") or path == "": - path += "index" - filepath = os.path.join("output", f"{path}.md") - - os.makedirs(os.path.dirname(filepath), exist_ok=True) - with open(filepath, "w", encoding="utf-8") as f: - f.write(f"# {title.strip()}\n\n{md}") - - # Follow only clean internal links under /docs/ - for href in response.css("a::attr(href)").getall(): - link = urljoin(response.url, href) - parsed = urlparse(link) - path = parsed.path - - if parsed.netloc == "yew.rs" and path.startswith("/docs/"): - if ( - "/docs/0." in path or - "/docs/next" in path or - "/docs/en" in path or - "#" in parsed.fragment or - path.count("/") > 5 - ): - continue - yield scrapy.Request(link.split("#")[0], callback=self.parse) -EOF - -# 5. 
Run the spider -scrapy crawl yew_docs diff --git a/herolib.egg-info/PKG-INFO b/herolib.egg-info/PKG-INFO new file mode 100644 index 0000000..c679a03 --- /dev/null +++ b/herolib.egg-info/PKG-INFO @@ -0,0 +1,12 @@ +Metadata-Version: 2.4 +Name: herolib +Version: 0.1.0 +Summary: A Python library for HeroCode +Author-email: Kilo Code +Requires-Python: >=3.8 +Description-Content-Type: text/markdown + +# herolib_python + +see also ~/code/git.ourworld.tf/tfgrid_research/tfdev +has some usefull stuff as well diff --git a/herolib.egg-info/SOURCES.txt b/herolib.egg-info/SOURCES.txt new file mode 100644 index 0000000..56326b2 --- /dev/null +++ b/herolib.egg-info/SOURCES.txt @@ -0,0 +1,78 @@ +README.md +pyproject.toml +herolib.egg-info/PKG-INFO +herolib.egg-info/SOURCES.txt +herolib.egg-info/dependency_links.txt +herolib.egg-info/top_level.txt +lib/__init__.py +lib/clients/__init__.py +lib/clients/assemblyai/__init__.py +lib/clients/assemblyai/client.py +lib/clients/stellar/__init__.py +lib/clients/stellar/horizon.py +lib/clients/stellar/model.py +lib/clients/stellar/testnet.py +lib/clients/telegram/__init__.py +lib/clients/telegram/bot.py +lib/clients/telegram/bot_audio.py +lib/clients/telegram/bot_text.py +lib/clients/telegram/errorqueue.py +lib/clients/vimeo/__init__.py +lib/clients/vimeo/client.py +lib/clients/vimeo/model_video.py +lib/clients/whisper/__init__.py +lib/clients/whisper/convert.py +lib/clients/whisper/whisper.py +lib/clients/wireless/__init__.py +lib/clients/wireless/wigle_net.py +lib/core/__init__.py +lib/core/heroscript/__init__.py +lib/core/heroscript/heroaction.py +lib/core/heroscript/heroscripts.py +lib/core/heroscript/mixin.py +lib/core/heroscript/tools.py +lib/core/heroscript/examples/__init__.py +lib/core/heroscript/examples/heroscript_example.py +lib/core/heroscript/examples/heroscript_example2.py +lib/core/heroscript/examples/wiki/__init__.py +lib/core/heroscript/examples/wiki/sub/__init__.py +lib/core/logger/__init__.py +lib/core/logger/factory.py +lib/core/logger/log.py +lib/core/logger/log_test.py +lib/core/logger/model.py +lib/core/logger/search.py +lib/core/loghandler/__init__.py +lib/core/loghandler/mylogging.py +lib/core/pathlib/__init__.py +lib/core/pathlib/pathlib.py +lib/core/texttools/__init__.py +lib/core/texttools/texttools.py +lib/crypt/__init__.py +lib/crypt/box/__init__.py +lib/crypt/box/box.py +lib/crypt/box/box_api.py +lib/data/__init__.py +lib/data/ourtime/__init__.py +lib/data/ourtime/ourtime.py +lib/downloader/__init__.py +lib/downloader/scrape_dynamic/dynamic_crawl.py +lib/downloader/scrape_scapegraph/main.py +lib/downloader/scrape_scapegraph/scrape.py +lib/downloader/scrape_scapegraph/scrape_md.py +lib/downloader/scrape_scapegraph/scrape_search.py +lib/downloader/scrape_scapegraph/scrape_with_local_llm.py +lib/downloader/scrape_scapegraph/scrape_with_local_llm_search.py +lib/tools/__init__.py +lib/tools/extensions.py +lib/tools/gitscanner.py +lib/tools/logger.py +lib/tools/md5.py +lib/tools/ourtime.py +lib/tools/pathtools.py +lib/tools/texttools.py +lib/web/__init__.py +lib/web/doctools/__init__.py +lib/web/doctools/html_replacer.py +lib/web/doctools/md_replacer.py +lib/web/doctools/processor.py \ No newline at end of file diff --git a/herolib.egg-info/dependency_links.txt b/herolib.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/herolib.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/herolib.egg-info/top_level.txt b/herolib.egg-info/top_level.txt new file mode 100644 index 0000000..a65b417 --- 
/dev/null +++ b/herolib.egg-info/top_level.txt @@ -0,0 +1 @@ +lib diff --git a/lib/__init__.py b/lib/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lib/clients/__init__.py b/lib/clients/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lib/clients/assemblyai/__init__.py b/lib/clients/assemblyai/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lib/clients/vimeo/__init__.py b/lib/clients/vimeo/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lib/clients/wireless/__init__.py b/lib/clients/wireless/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lib/core/__init__.py b/lib/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lib/core/heroscript/__init__.py b/lib/core/heroscript/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lib/core/heroscript/examples/__init__.py b/lib/core/heroscript/examples/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lib/core/heroscript/examples/wiki/__init__.py b/lib/core/heroscript/examples/wiki/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lib/core/heroscript/examples/wiki/sub/__init__.py b/lib/core/heroscript/examples/wiki/sub/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lib/core/logger/__pycache__/__init__.py b/lib/core/logger/__pycache__/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lib/core/loghandler/__init__.py b/lib/core/loghandler/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lib/core/loghandler/mylogging.py b/lib/core/loghandler/mylogging.py new file mode 100644 index 0000000..ae42a45 --- /dev/null +++ b/lib/core/loghandler/mylogging.py @@ -0,0 +1,214 @@ +from peewee import * +import time +from datetime import datetime +from typing import Optional, List, Dict, Any, Iterable, Union +import os +import logging +import traceback + +# Configure database path +DB_DIR = os.path.expanduser('~/hero/var/logdb/') +DB_FILE = os.path.join(DB_DIR, 'logs.db') + +# Create directory if it doesn't exist +os.makedirs(DB_DIR, exist_ok=True) + +# Initialize database +database = SqliteDatabase(DB_FILE, pragmas={'journal_mode': 'wal'}) + +class BaseModel(Model): + """Base model class for Peewee.""" + class Meta: + database = database + + def to_dict(self) -> Dict[str, Any]: + """Convert model instance to dictionary.""" + data = {} + for field_name in self._meta.fields: + field_value = getattr(self, field_name) + if field_name in ('time', 'last_seen') and isinstance(field_value, int): + # Convert epoch to a readable format for the frontend + data[field_name] = datetime.fromtimestamp(field_value).strftime('%d-%m %H:%M') + else: + data[field_name] = field_value + return data + +class Log(BaseModel): + """Model for INFO logs.""" + time = IntegerField(default=lambda: int(time.time()), index=True) + email = CharField(max_length=255, null=True) + logmsg = TextField() + level = IntegerField(default=100) + cat = CharField(max_length=100, index=True, default="general") + payload = TextField(null=True) + payload_cat = CharField(max_length=100, null=True) + + class Meta: + table_name = 'logs' + +class Error(BaseModel): + """Model for ERROR logs.""" + time = IntegerField(default=lambda: int(time.time()), index=True) + last_seen = IntegerField(default=lambda: int(time.time()), index=True) + email = CharField(max_length=255, null=True) + logmsg = TextField() + stacktrace = TextField(null=True) + count = IntegerField(default=1) + cat = 
CharField(max_length=100, index=True, default="general") + payload = TextField(null=True) + payload_cat = CharField(max_length=100, null=True) + + class Meta: + table_name = 'errors' + +def init_db_logging(): + """Create tables if they don't exist.""" + with database: + database.create_tables([Log, Error], safe=True) + +class DatabaseLogHandler(logging.Handler): + """A logging handler that writes logs to the Peewee database.""" + def emit(self, record): + stacktrace = None + if record.exc_info: + stacktrace = logging.Formatter().formatException(record.exc_info) + + if record.levelno >= logging.ERROR: + log_error( + msg=record.getMessage(), + cat=record.name, + stacktrace=stacktrace + ) + else: + log_info( + msg=record.getMessage(), + level=record.levelno, + cat=record.name + ) + +def log_error(msg: str, cat: str = "general", email: Optional[str] = None, stacktrace: Optional[str] = None, payload: Optional[str] = None, payload_cat: Optional[str] = None): + """Log an ERROR message to the database, handling duplicates.""" + try: + log_info(msg=msg, cat=cat, email=email, payload=payload, payload_cat=payload_cat) + except Exception as e: + pass + try: + if not stacktrace: + # Capture the current stack trace if not provided + stacktrace = "".join(traceback.format_stack()) + + # Filter out irrelevant lines from the stack trace + if stacktrace: + lines = stacktrace.split('\n') + filtered_lines = [ + line for line in lines + if 'python3.13/logging' not in line and 'src/mylogging.py' not in line + ] + stacktrace = '\n'.join(filtered_lines) + + one_day_ago = int(time.time()) - (24 * 3600) + + # Look for a similar error in the last 24 hours from the same user + existing_error = Error.select().where( + (Error.logmsg == msg) & + (Error.email == email) & + (Error.last_seen >= one_day_ago) + ).first() + + if existing_error: + # If found, increment counter and update last_seen + existing_error.count += 1 + existing_error.last_seen = int(time.time()) + existing_error.stacktrace = stacktrace + existing_error.save() + print(existing_error) + else: + # Otherwise, create a new error record + Error.create( + logmsg=msg, + cat=cat, + email=email, + stacktrace=stacktrace, + payload=payload, + payload_cat=payload_cat + ) + logging.info(f"Successfully logged new error: {msg}") + + except Exception as e: + logging.error(f"Failed to log error to {DB_FILE}: {e}") + +def log_info(msg: str, level: int = 0, cat: str = "general", email: Optional[str] = None, payload: Optional[str] = None, payload_cat: Optional[str] = None): + """Log an INFO message to the database.""" + try: + Log.create(logmsg=msg, level=level, cat=cat, email=email, payload=payload, payload_cat=payload_cat) + except Exception as e: + print(f"Failed to log info to {DB_FILE}: {e}") + +def get_errors(search: Optional[str] = None, cat: Optional[str] = None) -> List[Dict[str, Any]]: + """Get errors from the database with optional filters. Category search is prefix-based.""" + query = Error.select().order_by(Error.last_seen.desc()) + if search: + query = query.where(Error.logmsg.contains(search)) + if cat and cat.strip(): + query = query.where(Error.cat.startswith(cat.strip())) + return [e.to_dict() for e in query] + +def get_logs( + search: Optional[str] = None, + cat: Optional[str] = None, + level: Optional[int] = None, + hours_ago: Optional[int] = None, +) -> List[Dict[str, Any]]: + """Get logs from the database with optional filters. 
Category search is prefix-based.""" + query = Log.select().order_by(Log.time.desc()) + + if search and search.strip(): + query = query.where(Log.logmsg.contains(search)) + + if cat and cat.strip(): + query = query.where(Log.cat.startswith(cat.strip())) + + if level is not None: + query = query.where(Log.level <= level) + + if hours_ago is not None: + time_ago = int(time.time()) - (hours_ago * 3600) + query = query.where(Log.time >= time_ago) + + return [l.to_dict() for l in query] + +def get_log_by_id(log_id: int) -> Optional[Dict[str, Any]]: + """Get a single log by its ID.""" + try: + log = Log.get_by_id(log_id) + return log.to_dict() + except Log.DoesNotExist: + return None + +def delete_logs_older_than(minutes: int): + """Delete logs older than a specified number of minutes.""" + time_ago = int(time.time()) - (minutes * 60) + Log.delete().where(Log.time < time_ago).execute() + +def delete_errors_older_than(minutes: int): + """Delete errors older than a specified number of minutes.""" + time_ago = int(time.time()) - (minutes * 60) + Error.delete().where(Error.time < time_ago).execute() + +def get_unique_log_categories() -> List[str]: + """Get unique log categories from the database.""" + query = (Log + .select(Log.cat) + .where(Log.cat.is_null(False)) + .distinct() + .order_by(Log.cat)) + return [l.cat for l in query] + +def get_unique_error_categories() -> List[str]: + """Get unique error categories from the database.""" + query = (Error + .select(Error.cat) + .where(Error.cat.is_null(False)) + .distinct() + .order_by(Error.cat)) + return [e.cat for e in query] \ No newline at end of file diff --git a/lib/core/pathlib/__pycache__/__init__.py b/lib/core/pathlib/__pycache__/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lib/crypt/__init__.py b/lib/crypt/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lib/crypt/box/__init__.py b/lib/crypt/box/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lib/data/__init__.py b/lib/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lib/data/ourtime/__pycache__/__init__.py b/lib/data/ourtime/__pycache__/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lib/downloader/__init__.py b/lib/downloader/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lib/downloader/downloader.py b/lib/downloader/downloader.py deleted file mode 100644 index 15ce01b..0000000 --- a/lib/downloader/downloader.py +++ /dev/null @@ -1,412 +0,0 @@ -import json -import logging -import mimetypes # Added -import os -from datetime import datetime, timedelta -from urllib.parse import urljoin, urlparse - -import scrapy -from scrapy.crawler import CrawlerProcess -from scrapy.linkextractors import LinkExtractor -from scrapy.utils.project import get_project_settings - -# Configure logging -logging.basicConfig( - level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" -) -logger = logging.getLogger(__name__) - -STATE_FILE_NAME = ".download_state.json" - - -class GenericDownloaderSpider(scrapy.Spider): - name = "generic_downloader" - - def __init__( - self, - start_url, - dest_dir, - allowed_domains, - ignore_paths=None, - depth_limit=0, - follow_links=True, - max_age_hours=0, - state_data=None, - *args, - **kwargs, - ): - super(GenericDownloaderSpider, self).__init__(*args, **kwargs) - self.start_urls = [start_url] - self.dest_dir = dest_dir - self.allowed_domains = allowed_domains - self.ignore_paths = ignore_paths if ignore_paths else [] - self.depth_limit 
= int(depth_limit) - self.follow_links = bool(follow_links) - self.max_age_hours = int(max_age_hours) - self.state_data = state_data if state_data else {} - self.link_extractor = LinkExtractor(allow_domains=self.allowed_domains) - - os.makedirs(self.dest_dir, exist_ok=True) - logger.info(f"Downloader initialized for {start_url}") - logger.info(f"Destination directory: {self.dest_dir}") - logger.info(f"Allowed domains: {self.allowed_domains}") - logger.info(f"Ignore paths: {self.ignore_paths}") - logger.info(f"Depth limit: {self.depth_limit}") - logger.info(f"Follow links: {self.follow_links}") - logger.info(f"Max age (hours): {self.max_age_hours}") - - def _should_ignore(self, url_path): - for pattern in self.ignore_paths: - if pattern in url_path: # Simple substring match for now, can be regex - return True - return False - - def _get_file_path(self, response): # Changed signature to take response - url = response.url - parsed_url = urlparse(url) - original_path = parsed_url.path # e.g. /foo/bar.html or /foo/ or / - - # Determine base_name and current_ext from original_path - if original_path.endswith("/"): - base_name = "index" - current_ext = "" - # path_for_dirs is the path part that forms the directory structure - path_for_dirs = original_path.lstrip("/") - else: - path_basename = os.path.basename(original_path) - if ( - not path_basename and original_path == "/" - ): # Root path e.g. http://example.com - base_name = "index" - current_ext = "" - else: # e.g. /file.txt or /file_no_ext or /.config - base_name, current_ext = os.path.splitext(path_basename) - if not base_name and current_ext: # Hidden file like /.bashrc - base_name = current_ext # Treat .bashrc as base_name - current_ext = "" # No further extension part - path_for_dirs = os.path.dirname(original_path.lstrip("/")) - - # Try to get extension from Content-Type - content_type = ( - response.headers.get("Content-Type", b"") - .decode("utf-8") - .split(";")[0] - .strip() - ) - mime_ext = mimetypes.guess_extension(content_type) if content_type else None - - final_ext = current_ext - if mime_ext and not current_ext: # No path extension, use MIME type's - final_ext = mime_ext - elif ( - mime_ext - and current_ext.lower() in [".htm", ".html"] - and mime_ext - and mime_ext.lower() not in [".htm", ".html"] - ): - # Path had .html/.htm, but MIME type suggests something more specific - final_ext = mime_ext - logger.debug( - f"URL {url}: Path ext {current_ext} overridden by Content-Type ext {mime_ext}." - ) - elif not final_ext and ( - content_type.startswith("text/") - or content_type - in ["application/javascript", "application/json", "application/xml"] - ): - # Fallback for common text types if no extension determined yet and no path ext - if not base_name.endswith( - (".js", ".css", ".json", ".xml", ".txt") - ): # Avoid double .html.html - final_ext = ".html" - - filename = base_name + final_ext - - # Create path components for the directory structure - components = [] - if path_for_dirs: - components.extend(comp for comp in path_for_dirs.split("/") if comp) - components.append(filename) - - # Sanitize components - sane_components = [] - for comp_idx, comp_val in enumerate(components): - # Basic sanitization: replace invalid chars, limit length, avoid '..' - # Allow '.' 
for filenames but not as a full component name if it's not the only char - if comp_val == "..": - continue # Skip parent dir references in path construction - - sane_comp = "".join( - c if c.isalnum() or c in ["-", "_", "."] else "_" for c in comp_val - ) - sane_comp = sane_comp[:150] # Limit component length - - if ( - not sane_comp and comp_idx == len(components) - 1 - ): # last component (filename) became empty - sane_comp = "downloaded_file" + final_ext # fallback filename - elif not sane_comp: - sane_comp = "_" # placeholder for empty dir name - - if sane_comp: # Ensure component is not empty after sanitization - sane_components.append(sane_comp) - - if not sane_components: # If all components were sanitized away or skipped - sane_components = [filename if filename else "unknown_file" + final_ext] - - file_path = os.path.join(self.dest_dir, *sane_components) - return file_path - - def parse(self, response, depth=0): - url = response.url - logger.info(f"Processing URL (depth {depth}): {url}") - - parsed_url = urlparse(url) - if self._should_ignore(parsed_url.path): - logger.info(f"Ignoring URL (matches ignore_paths): {url}") - return - - file_path = self._get_file_path(response) # Pass response object - - # Check download state and max_age - if url in self.state_data: - url_state = self.state_data[url] - last_download_time_str = url_state.get("timestamp") - # Consider previous status; only skip if it was a success or another skip - can_skip_based_on_history = url_state.get("status", "").startswith( - "success" - ) or url_state.get("status", "").startswith("skipped") - - if last_download_time_str and can_skip_based_on_history: - last_download_time = datetime.fromisoformat(last_download_time_str) - if self.max_age_hours > 0 and ( - datetime.utcnow() - last_download_time - ) < timedelta(hours=self.max_age_hours): - logger.info( - f"Skipping download for {url}, recently processed at {last_download_time_str} with status '{url_state.get('status')}'." - ) - # Update state to reflect this skip check - self.state_data[url]["status"] = "skipped_max_age" - self.state_data[url]["skipped_timestamp"] = ( - datetime.utcnow().isoformat() - ) - # Still need to check for links if recursive - # Corrected depth condition: - # Follow if self.depth_limit is 0 (infinite) OR current depth is less than a positive limit. - if self.follow_links and ( - self.depth_limit == 0 or depth < self.depth_limit - ): - for link in self.link_extractor.extract_links(response): - parsed_link_url = urlparse(link.url) - if not self._should_ignore(parsed_link_url.path): - yield response.follow(link, callback=self.parse) - else: - logger.info( - f"Ignoring extracted link (matches ignore_paths): {link.url}" - ) - return - - logger.info(f"Processing and saving {url} to {file_path}") - os.makedirs(os.path.dirname(file_path), exist_ok=True) - - try: - with open(file_path, "wb") as f: - f.write(response.body) - logger.info(f"Successfully saved {url} to {file_path}") - self.state_data[url] = { - "timestamp": datetime.utcnow().isoformat(), - "status": "success", - "path": file_path, - "size": len(response.body), - } - except Exception as e: - logger.error(f"Failed to save {url} to {file_path}: {e}") - self.state_data[url] = { - "timestamp": datetime.utcnow().isoformat(), - "status": "failed", - "error": str(e), - } - return # Do not proceed further if save failed - - # Corrected depth condition for following links: - # Follow if self.depth_limit is 0 (infinite) OR current depth is less than a positive limit. 
- if self.follow_links and (self.depth_limit == 0 or depth < self.depth_limit): - logger.info( - f"Following links from {url} at custom depth {depth} (for next level {depth + 1})" - ) - extracted_links = list(self.link_extractor.extract_links(response)) - if not extracted_links: - logger.info(f" No links extracted from {url} by LinkExtractor.") - else: - logger.info( - f" LinkExtractor found {len(extracted_links)} links from {url}: {[l.url for l in extracted_links]}" - ) - - for link_idx, link in enumerate(extracted_links): - logger.debug( - f" Considering link {link_idx + 1}/{len(extracted_links)}: Text='{link.text}', URL='{link.url}'" - ) - parsed_link_url = urlparse(link.url) - if self._should_ignore(parsed_link_url.path): - logger.info( - f" Ignoring extracted link (matches ignore_paths): {link.url}" - ) - else: - logger.info( - f" Yielding request for: {link.url} (to be processed at custom depth {depth + 1})" - ) - yield response.follow(link, callback=self.parse) - - def closed(self, reason): - logger.info(f"Spider closed: {reason}. Finalizing and saving state.") - state_file_path = os.path.join(self.dest_dir, STATE_FILE_NAME) - try: - # Ensure the directory for the state file exists, though dest_dir should already. - os.makedirs(os.path.dirname(state_file_path), exist_ok=True) - with open(state_file_path, "w") as f: - json.dump(self.state_data, f, indent=4) - logger.info( - f"Spider successfully saved state ({len(self.state_data)} items) to {state_file_path}" - ) - except Exception as e: - logger.error( - f"Spider failed to save state to {state_file_path}: {e}", exc_info=True - ) - - -def download_site( - start_url, - dest_dir, - recursive=True, - ignore_paths=None, - depth_limit=0, # 0 means no limit if recursive is True - follow_links=True, # This is somewhat redundant if recursive is True, but good for clarity - max_age_hours=24, # Re-download if older than 24 hours -): - """ - Downloads a website or a single page. - - :param start_url: The URL to start downloading from. - :param dest_dir: The directory to save downloaded files. - :param recursive: Whether to download recursively. - :param ignore_paths: A list of path substrings or regex patterns to ignore. - :param depth_limit: Maximum depth for recursive downloads (0 for no limit). - :param follow_links: Whether to follow links on pages (primarily for recursive). - :param max_age_hours: Max age of a file in hours. If a file was downloaded - more recently than this, it won't be re-downloaded. - 0 means always re-download. - :return: A dictionary summarizing the download process. - """ - parsed_url = urlparse(start_url) - if not parsed_url.scheme or not parsed_url.netloc: - logger.error( - f"Invalid start_url: {start_url}. Must be a full URL (e.g., http://example.com)" - ) - return None - - allowed_domains = [parsed_url.hostname] # Changed from netloc to hostname - - state_file_path = os.path.join(dest_dir, STATE_FILE_NAME) - state_data = {} - if os.path.exists(state_file_path): - try: - with open(state_file_path, "r") as f: - state_data = json.load(f) - logger.info(f"Loaded download state from {state_file_path}") - except json.JSONDecodeError: - logger.warning( - f"Could not decode JSON from state file {state_file_path}. Starting fresh." - ) - except Exception as e: - logger.error( - f"Error loading state file {state_file_path}: {e}. Starting fresh." 
- ) - - settings = get_project_settings() - settings.set("ROBOTSTXT_OBEY", False) # Explicitly disable robots.txt - # settings.set('LOG_LEVEL', 'DEBUG') # Optionally enable for more Scrapy internal logs - - effective_scrapy_depth = 0 # Default for non-recursive or depth_limit=0 with recursion (0 means infinite for Scrapy) - if recursive and int(depth_limit) > 0: - effective_scrapy_depth = int(depth_limit) - # If not recursive, effective_scrapy_depth remains 0. - # If recursive and depth_limit is 0, effective_scrapy_depth remains 0 (infinite). - settings.set("DEPTH_LIMIT", effective_scrapy_depth) - - logger.info(f"Scrapy DEPTH_LIMIT set to: {effective_scrapy_depth}") - # Scrapy's DEPTH_PRIORITY and SCHEDULER_DISK_QUEUE might be useful for large crawls - # For now, keeping it simple. - - process = CrawlerProcess(settings) - - # The spider needs to be instantiated with all its custom args - # Scrapy's process.crawl can take kwargs which are passed to the spider's __init__ - process.crawl( - GenericDownloaderSpider, - start_url=start_url, - dest_dir=dest_dir, - allowed_domains=allowed_domains, - ignore_paths=ignore_paths, - depth_limit=int(depth_limit) - if recursive - else 0, # Spider handles its own depth based on this - follow_links=follow_links and recursive, - max_age_hours=int(max_age_hours), - state_data=state_data, - ) - - logger.info(f"Starting download process for {start_url}...") - process.start() # This will block until the crawl is finished - - # The spider's closed() method is now responsible for writing the final state. - # Load this definitive state to build the summary. - final_state_data_for_summary = {} - if os.path.exists(state_file_path): - try: - with open(state_file_path, "r") as f: - final_state_data_for_summary = json.load(f) - logger.info( - f"Loaded final state ({len(final_state_data_for_summary)} items) from {state_file_path} for summary construction." - ) - except json.JSONDecodeError as e: - logger.error( - f"Error decoding JSON from final state file {state_file_path} for summary: {e}. Summary will be based on empty or incomplete state." - ) - except Exception as e: - logger.error( - f"Error loading final state from {state_file_path} for summary: {e}. Summary will be based on empty or incomplete state." - ) - else: - logger.warning( - f"State file {state_file_path} not found after spider closed. Summary will be based on empty state." - ) - - summary = { - "start_url": start_url, - "dest_dir": dest_dir, - "total_processed_urls": len(final_state_data_for_summary), - "successful_downloads": 0, - "failed_downloads": 0, - "skipped_max_age": 0, - "total_bytes_downloaded": 0, - "state_file_path": state_file_path, - "errors": [], - } - - # Populate summary from the final_state_data_for_summary loaded from the file - for url_key, data_val in final_state_data_for_summary.items(): - status = data_val.get("status") - if status == "success": - summary["successful_downloads"] += 1 - summary["total_bytes_downloaded"] += data_val.get("size", 0) - elif status == "failed": - summary["failed_downloads"] += 1 - if "error" in data_val: - summary["errors"].append(f"URL: {url_key}, Error: {data_val['error']}") - elif status == "skipped_max_age": - summary["skipped_max_age"] += 1 - # Any errors during state file loading for summary should also be noted if critical - # For now, the logs capture it. If final_state_data_for_summary is empty due to load error, summary will reflect that. - - logger.info(f"Download process finished. 
Summary: {json.dumps(summary, indent=2)}")
-    return summary
diff --git a/lib/downloader/scrape_dynamic b/lib/downloader/scrape_dynamic
new file mode 120000
index 0000000..878fd62
--- /dev/null
+++ b/lib/downloader/scrape_dynamic
@@ -0,0 +1 @@
+../../../../tfgrid_research/tfdev/research/scrape_dynamic
\ No newline at end of file
diff --git a/lib/downloader/scrape_fast b/lib/downloader/scrape_fast
new file mode 120000
index 0000000..239e333
--- /dev/null
+++ b/lib/downloader/scrape_fast
@@ -0,0 +1 @@
+../../../../tfgrid_research/tfdev/research/scrape_fast
\ No newline at end of file
diff --git a/lib/downloader/scrape_scapegraph b/lib/downloader/scrape_scapegraph
new file mode 120000
index 0000000..adff26e
--- /dev/null
+++ b/lib/downloader/scrape_scapegraph
@@ -0,0 +1 @@
+../../../../tfgrid_research/tfdev/research/scrape_scapegraph
\ No newline at end of file
diff --git a/lib/web/__init__.py b/lib/web/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/lib/web/doctools/__init__.py b/lib/web/doctools/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/pyproject.toml b/pyproject.toml
index 250e437..9445434 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,15 +1,18 @@
 [project]
-name = "Herolib"
-version = "0.9.0"
-description = "Lib from Hero's project for Actors"
-requires-python = ">=3.13"
-dependencies = [
-    "peewee>=3.17.0",
-    "pygments>=2.16.1",
-    "toml",
-    "requests>=2.31.0",
-    "beautifulsoup4>=4.12.2",
-    "pydantic>=2.8.0",
-    "scrapy==2.13.3",
-    "markdownify=1.1.0"
+name = "herolib"
+version = "0.1.0"
+description = "A Python library for HeroCode"
+authors = [
+    { name = "Kilo Code", email = "kilo.code@example.com" }
 ]
+readme = "README.md"
+requires-python = ">=3.8"
+dependencies = []
+
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["lib*"]
\ No newline at end of file
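
Note (not part of the patch): a minimal usage sketch of the new lib/core/loghandler/mylogging.py module added above. It assumes the project is installed so that the `lib` package is importable (per top_level.txt and [tool.setuptools.packages.find]) and that peewee is available at runtime; the logger name, categories, and messages below are illustrative only.

# Usage sketch (illustrative, not part of the commit). Assumes `lib` is
# importable and peewee is installed; the database lives in ~/hero/var/logdb/logs.db.
import logging

from lib.core.loghandler.mylogging import (
    DatabaseLogHandler,
    get_logs,
    init_db_logging,
    log_error,
    log_info,
)

init_db_logging()  # create the `logs` and `errors` tables if they do not exist

# Route the standard logging module through the database handler.
logger = logging.getLogger("myapp")          # hypothetical logger name
logger.setLevel(logging.INFO)
logger.addHandler(DatabaseLogHandler())
logger.info("service started")               # stored via log_info()
logger.error("something went wrong")         # stored via log_error()

# Or call the helper functions directly.
log_info("user signed in", level=20, cat="auth")
log_error("payment failed", cat="billing")

# Query the last 24 hours of "auth" logs; category matching is prefix-based.
for entry in get_logs(cat="auth", hours_ago=24):
    print(entry["time"], entry["cat"], entry["logmsg"])

The module opens SQLite in WAL mode and deduplicates repeated errors: an identical message from the same email within the last 24 hours increments a counter on the existing row instead of creating a new one.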