...
This commit is contained in:
134
examples/downloader/threefold_scraper.py
Executable file
134
examples/downloader/threefold_scraper.py
Executable file
@@ -0,0 +1,134 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import html2text
|
||||
import lmstudio as lms
|
||||
import requests
|
||||
import scrapy
|
||||
from IPython import embed
|
||||
from openai import OpenAI
|
||||
from scrapy.crawler import CrawlerProcess
|
||||
from scrapy.http import Request
|
||||
|
||||
# Connection settings for the local LM Studio server, which is addressed
# through its OpenAI-compatible HTTP API.
LMSTUDIO_BASE_URL = "http://localhost:1234/v1"
# The OpenAI client expects an api_key to be set; LM Studio may not strictly
# need one, so this is a placeholder value rather than a real secret.
LMSTUDIO_API_KEY = "YOUR_ANY_API_KEY"

# Shared client used by the chat-completion and embedding helpers in this file.
client = OpenAI(base_url=LMSTUDIO_BASE_URL, api_key=LMSTUDIO_API_KEY)
|
||||
|
||||
|
||||
class ThreeFoldDocsSpider(scrapy.Spider):
    """Spider that saves the ThreeFold tech docs start page as Markdown.

    For each response, the ``<main>`` element is extracted, converted to
    Markdown via ``convert_html_to_markdown_with_lmstudio`` and written to
    ``output_path``.
    """

    name = "threefold_docs"
    start_urls = ["https://threefold.info/tech/docs/"]
    # File the converted Markdown is written to (relative to the working dir).
    output_path = "threefold_docs.md"

    @staticmethod
    def _strip_code_fence(text):
        """Return *text* without a surrounding Markdown code fence.

        Models often wrap their answer in a fenced block (three backticks,
        optionally followed by a language tag such as ``markdown``). The
        whole opening fence line is dropped regardless of its info string,
        and a trailing closing fence is removed as well. Text that does not
        start with a fence is returned with only leading whitespace removed.
        """
        stripped = text.lstrip()
        if not stripped.startswith("```"):
            return stripped

        # Drop everything up to and including the first newline, i.e. the
        # full opening fence line ("```", "```markdown", "```md", ...).
        _, _, body = stripped.partition("\n")

        # Remove the matching closing fence if the answer ends with one.
        trimmed = body.rstrip()
        if trimmed.endswith("```"):
            body = trimmed[: -len("```")].rstrip() + "\n"
        return body

    def parse(self, response):
        """Convert the page's ``<main>`` element to Markdown and save it.

        Logs a message and returns without writing anything when the page
        has no ``<main>`` element or when the conversion yields no content.
        """
        # Extract the main content; .get() returns None when nothing matches.
        content = response.css("main").get()
        if not content:
            self.log(f"No <main> element found at {response.url}")
            return

        # Convert HTML to Markdown using the model served by LM Studio.
        markdown_content = convert_html_to_markdown_with_lmstudio(content)
        if not markdown_content:
            self.log(f"Could not convert HTML to Markdown for {response.url}")
            return

        markdown_content = self._strip_code_fence(markdown_content)

        with open(self.output_path, "w", encoding="utf-8") as f:
            f.write(markdown_content)

        self.log(f"Saved content to {self.output_path}")
|
||||
|
||||
|
||||
def convert_html_to_markdown_with_lmstudio(html_content, model_name="jinaai/ReaderLM-v2"):
    """Convert an HTML string to Markdown with a chat model served by LM Studio.

    Args:
        html_content: HTML markup to convert. ``None`` or an empty string is
            treated as "nothing to convert".
        model_name: Model identifier passed to the chat-completions endpoint.
            The default is assumed to be the correct LM Studio ID for
            ReaderLM-v2; adjust it if your installation uses another name.

    Returns:
        The model's reply text, or ``None`` when there is no input or the
        request fails for any reason (the error is printed).
    """
    # Avoid sending a request (and the literal text "None") for missing input.
    if not html_content:
        return None

    try:
        # Use the OpenAI-compatible API provided by LM Studio.
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that converts HTML to Markdown.",
                },
                {
                    "role": "user",
                    "content": f"Please convert the following HTML to Markdown:\n\n{html_content}",
                },
            ],
            stream=False,  # Set to True if streaming is desired
        )
        return response.choices[0].message.content
    except Exception as e:
        # Deliberately best-effort: any failure (server down, model missing,
        # empty reply) is reported and signalled to the caller as None.
        print(f"Error converting HTML to Markdown with LMStudio: {e}")
        return None
|
||||
|
||||
|
||||
def scrape_threefold_docs():
    """Crawl the ThreeFold docs with ``ThreeFoldDocsSpider``.

    Returns:
        The name of the Markdown file the spider writes to
        (``"threefold_docs.md"``). The name is returned unconditionally, so
        callers should check that the file exists.
    """
    crawler_settings = {
        "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "LOG_LEVEL": "INFO",
    }

    runner = CrawlerProcess(crawler_settings)
    runner.crawl(ThreeFoldDocsSpider)
    # Run the scheduled crawl before reporting the output file name.
    runner.start()

    return "threefold_docs.md"
|
||||
|
||||
|
||||
# Note: This script talks to LM Studio through its OpenAI-compatible local
# server, using the `openai` client. Models are not downloaded or loaded
# here; do that manually in the LM Studio application (or through its local
# server API) before running this script.
# Make sure the "jinaai/jina-embeddings-v2-base-en" model (or whichever Jina
# embeddings model you pass as `model_name`, e.g. a v3 model if available)
# is downloaded and loaded in LM Studio.
|
||||
|
||||
|
||||
def get_embedding_with_lmstudio(text, model_name="jinaai/jina-embeddings-v2-base-en"):
    """Return the embedding vector for *text* from a model served by LM Studio.

    The model named by ``model_name`` must already be downloaded and loaded
    in the LM Studio application.

    Args:
        text: The text to embed. ``None`` or an empty string is rejected
            without contacting the server.
        model_name: Embedding model identifier sent to the embeddings endpoint.

    Returns:
        The embedding of *text* as returned by the API, or ``None`` when
        there is no input or the request fails (the error is printed).
    """
    # An empty input cannot be embedded; report it as such instead of letting
    # the request fail with a misleading "is LM Studio running?" hint.
    if not text:
        print("Error getting embedding with LMStudio: no input text provided.")
        return None

    try:
        # Use the OpenAI-compatible API for embeddings.
        response = client.embeddings.create(model=model_name, input=[text])
        return response.data[0].embedding
    except Exception as e:
        # Best-effort boundary: report any failure and signal it with None.
        print(f"Error getting embedding with LMStudio: {e}")
        print("Please ensure LM Studio is running and the specified model is loaded.")
        return None
|
||||
|
||||
|
||||
def main(interactive=False):
    """Scrape the ThreeFold docs and generate an embedding for the result.

    Args:
        interactive: When true, open an IPython shell after scraping so the
            results can be inspected before the embedding step. Off by
            default so the script can run unattended.
    """
    model_to_use = "jinaai/jina-embeddings-v2-base-en"

    markdown_file = scrape_threefold_docs()

    # Debugging aid only; an unconditional shell would block automated runs.
    if interactive:
        embed()

    if not os.path.exists(markdown_file):
        print(f"No scraped output found at {markdown_file}; nothing to embed.")
        return

    with open(markdown_file, "r", encoding="utf-8") as f:
        content = f.read()

    # Example usage of the embedding function.
    embedding = get_embedding_with_lmstudio(content, model_to_use)
    if embedding:
        print(
            f"Successfully generated embedding (first 10 dimensions): {embedding[:10]}..."
        )
    else:
        print("Failed to generate embedding.")
|
||||
|
||||
# Model unloading should be done manually in LM Studio.
|
||||
|
||||
|
||||
# Run the scrape-and-embed pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
Reference in New Issue
Block a user