...

2025-08-05 15:15:36 +02:00
parent 4bd960ed05
commit 7fabb4163a
192 changed files with 14901 additions and 0 deletions
--- a/examples/downloader/threefold_scraper.py
+++ b/examples/downloader/threefold_scraper.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+
+import argparse
+import json
+import os
+import tempfile
+
+import html2text
+import lmstudio as lms
+import requests
+import scrapy
+from IPython import embed
+from openai import OpenAI
+from scrapy.crawler import CrawlerProcess
+from scrapy.http import Request
+
+client = OpenAI(base_url="http://localhost:1234/v1", api_key="YOUR_ANY_API_KEY")
+# api_key is usually required, but for LM Studio it might not be strictly necessary.
+# client = OpenAI(base_url="http://localhost:1234/v1", api_key="YOUR_ANY_API_KEY")
+# openai.api_key = "YOUR_ANY_API_KEY" # Replace with your actual key if needed
+
+
+class ThreeFoldDocsSpider(scrapy.Spider):
+    name = "threefold_docs"
+    start_urls = ["https://threefold.info/tech/docs/"]
+
+    def parse(self, response):
+        # Extract the main content
+        content = response.css("main").get()
+
+        # Convert HTML to markdown using LMStudio
+        markdown_content = convert_html_to_markdown_with_lmstudio(content)
+
+        # Save the content
+        if markdown_content:
+            # Remove leading whitespace and markdown code block fence if present
+            markdown_content = markdown_content.lstrip()
+            if markdown_content.startswith("```markdown"):
+                markdown_content = markdown_content[len("```markdown\n") :]
+            elif markdown_content.startswith("```"):
+                markdown_content = markdown_content[len("```\n") :]
+
+            with open("threefold_docs.md", "w", encoding="utf-8") as f:
+                f.write(markdown_content)
+
+            self.log(f"Saved content to threefold_docs.md")
+        else:
+            self.log(f"Could not convert HTML to Markdown for {response.url}")
+
+
+def convert_html_to_markdown_with_lmstudio(html_content):
+    """Convert HTML to Markdown using LMStudio with jinaai.readerlm-v2"""
+    try:
+        # Use the OpenAI-compatible API provided by LMStudio
+        response = client.chat.completions.create(
+            model="jinaai/ReaderLM-v2",  # Assuming this is the correct model ID
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are a helpful assistant that converts HTML to Markdown.",
+                },
+                {
+                    "role": "user",
+                    "content": f"Please convert the following HTML to Markdown:\n\n{html_content}",
+                },
+            ],
+            stream=False,  # Set to True if streaming is desired
+        )
+        return response.choices[0].message.content
+    except Exception as e:
+        print(f"Error converting HTML to Markdown with LMStudio: {e}")
+        return None
+
+
+def scrape_threefold_docs():
+    """Run the Scrapy spider to scrape ThreeFold docs"""
+    process = CrawlerProcess(
+        {
+            "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+            "LOG_LEVEL": "INFO",
+        }
+    )
+
+    process.crawl(ThreeFoldDocsSpider)
+    process.start()
+
+    return "threefold_docs.md"
+
+
+# Note: This script talks to LM Studio through an OpenAI-compatible client.
+# Model downloading and loading are not done by this client; they are typically
+# handled manually in the LM Studio application or through its local server API.
+# Ensure the "jinaai/jina-embeddings-v2-base-en" model (or the desired Jina
+# embeddings v3 model, if available) is downloaded and loaded in your LM Studio
+# application before running this script.
+
+
+def get_embedding_with_lmstudio(text, model_name="jinaai/jina-embeddings-v2-base-en"):
+    """Get embedding for text using LM Studio with the specified model."""
+    # Ensure the "jinaai/jina-embeddings-v3" model is downloaded and loaded in your LM Studio application before running this script.
+    try:
+        # Use the OpenAI-compatible API for embeddings
+        response = client.embeddings.create(model=model_name, input=[text])
+        return response.data[0].embedding
+    except Exception as e:
+        print(f"Error getting embedding with LMStudio: {e}")
+        print("Please ensure LM Studio is running and the specified model is loaded.")
+        return None
+
+
+def main():
+    model_to_use = "jinaai/jina-embeddings-v2-base-en"
+
+    markdown_file = scrape_threefold_docs()
+
+    embed()
+
+    if os.path.exists(markdown_file):
+        with open(markdown_file, "r", encoding="utf-8") as f:
+            content = f.read()
+
+        # Example usage of the embedding function
+        embedding = get_embedding_with_lmstudio(content, model_to_use)
+        if embedding:
+            print(
+                f"Successfully generated embedding (first 10 dimensions): {embedding[:10]}..."
+            )
+        else:
+            print("Failed to generate embedding.")
+
+    # Model unloading should be done manually in LM Studio.
+
+
+if __name__ == "__main__":
+    main()