#!/usr/bin/env python3
"""Scrape the ThreeFold tech docs, convert them to Markdown, and embed them.

The spider fetches the docs start page, sends the HTML of its ``<main>``
element to a chat model served by a local LM Studio instance (through its
OpenAI-compatible API) to obtain Markdown, and writes the result to
``OUTPUT_FILE``.  ``main`` then reads that file back and requests an
embedding for its content from the same server.
"""
import argparse
import json
import os
import tempfile

import html2text
import lmstudio as lms
import requests
import scrapy
from IPython import embed
from openai import OpenAI
from scrapy.crawler import CrawlerProcess
from scrapy.http import Request

# File the spider writes the converted Markdown to (relative to the CWD).
OUTPUT_FILE = "threefold_docs.md"

# An api_key is usually required by the client, but for LM Studio it might not
# be strictly necessary; replace the placeholder with a real key if needed.
client = OpenAI(base_url="http://localhost:1234/v1", api_key="YOUR_ANY_API_KEY")


def _strip_markdown_fence(text):
    """Return *text* without a wrapping Markdown code fence.

    Chat models often wrap their answer in a fenced block such as
    "```markdown ... ```".  The whole opening fence line is dropped, whatever
    language tag or line ending it carries, and a closing fence at the very
    end of the text is dropped as well.  Text that does not start with a
    fence is returned with only its leading whitespace removed.
    """
    body = text.lstrip()
    if not body.startswith("```"):
        return body

    # Drop the entire opening fence line instead of a fixed character count,
    # so tags like "```md" or a "\r\n" line ending do not corrupt the output.
    _, _, body = body.partition("\n")

    trimmed = body.rstrip()
    if trimmed.endswith("```"):
        body = trimmed[: -len("```")].rstrip()
        if body:
            body += "\n"
    return body


class ThreeFoldDocsSpider(scrapy.Spider):
    """Spider that saves the ThreeFold docs start page as Markdown."""

    name = "threefold_docs"
    start_urls = ["https://threefold.info/tech/docs/"]

    def parse(self, response):
        """Convert the page's ``<main>`` element to Markdown and save it.

        Nothing is written when the page has no ``<main>`` element or when
        the conversion yields no content; a warning is logged instead.
        """
        content = response.css("main").get()
        if content is None:
            # Without this guard the literal text "None" would be sent to the
            # model and whatever it answered would be saved as the docs.
            self.logger.warning("No <main> element found at %s", response.url)
            return

        markdown_content = convert_html_to_markdown_with_lmstudio(content)
        if markdown_content:
            markdown_content = _strip_markdown_fence(markdown_content)

        if not markdown_content:
            self.logger.warning(
                "Could not convert HTML to Markdown for %s", response.url
            )
            return

        with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
            f.write(markdown_content)
        # self.log() defaults to DEBUG, which the crawler's LOG_LEVEL of INFO
        # would hide; log at INFO so the message is actually shown.
        self.logger.info("Saved content to %s", OUTPUT_FILE)


def convert_html_to_markdown_with_lmstudio(html_content):
    """Convert HTML to Markdown using the ReaderLM-v2 model in LM Studio.

    Args:
        html_content: HTML source to convert.

    Returns:
        The model's reply as a string, or ``None`` if the request failed.
    """
    try:
        # Use the OpenAI-compatible API provided by LM Studio.
        response = client.chat.completions.create(
            model="jinaai/ReaderLM-v2",  # Assuming this is the correct model ID
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that converts HTML to Markdown.",
                },
                {
                    "role": "user",
                    "content": f"Please convert the following HTML to Markdown:\n\n{html_content}",
                },
            ],
            stream=False,  # Set to True if streaming is desired
        )
        return response.choices[0].message.content
    except Exception as e:
        # Best-effort boundary: any failure is reported and signalled to the
        # caller as None rather than aborting the crawl.
        print(f"Error converting HTML to Markdown with LMStudio: {e}")
        return None


def scrape_threefold_docs():
    """Run the Scrapy spider to scrape the ThreeFold docs.

    Blocks until the crawl has finished.

    Returns:
        The path the spider writes to.  It is returned whether or not the
        crawl actually produced the file, so callers should check that it
        exists.
    """
    process = CrawlerProcess(
        {
            "USER_AGENT": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/91.0.4472.124 Safari/537.36"
            ),
            "LOG_LEVEL": "INFO",
        }
    )
    process.crawl(ThreeFoldDocsSpider)
    process.start()
    return OUTPUT_FILE


# Note: this script talks to LM Studio only through its OpenAI-compatible API.
# Model downloading and loading are not automated here; they are typically
# handled manually in the LM Studio application or through its local server.
# Ensure the embedding model you pass as `model_name` (by default
# "jinaai/jina-embeddings-v2-base-en") is downloaded and loaded in LM Studio
# before running this script.
def get_embedding_with_lmstudio(text, model_name="jinaai/jina-embeddings-v2-base-en"):
    """Get an embedding for *text* from LM Studio using the given model.

    Args:
        text: The text to embed.
        model_name: Identifier of the embedding model loaded in LM Studio.

    Returns:
        The embedding returned by the server, or ``None`` if *text* is empty
        or the request failed.
    """
    if not text:
        # Report empty input for what it is instead of letting the request
        # fail and blaming the server or the model.
        print("Error getting embedding with LMStudio: input text is empty.")
        return None

    try:
        # Use the OpenAI-compatible API for embeddings.
        response = client.embeddings.create(model=model_name, input=[text])
        return response.data[0].embedding
    except Exception as e:
        print(f"Error getting embedding with LMStudio: {e}")
        print("Please ensure LM Studio is running and the specified model is loaded.")
        return None


def main():
    """Scrape the docs, then embed the resulting Markdown file."""
    model_to_use = "jinaai/jina-embeddings-v2-base-en"

    markdown_file = scrape_threefold_docs()

    # NOTE(review): this drops into an interactive IPython shell and blocks
    # until it is exited -- looks like a debugging hook; confirm it should
    # stay, since it prevents unattended runs.
    embed()

    if os.path.exists(markdown_file):
        with open(markdown_file, "r", encoding="utf-8") as f:
            content = f.read()

        # Example usage of the embedding function.
        embedding = get_embedding_with_lmstudio(content, model_to_use)
        if embedding:
            print(
                f"Successfully generated embedding (first 10 dimensions): {embedding[:10]}..."
            )
        else:
            print("Failed to generate embedding.")
    else:
        print(f"Markdown file {markdown_file} was not created; nothing to embed.")

    # Model unloading should be done manually in LM Studio.


if __name__ == "__main__":
    main()