...
This commit is contained in:
0
lib/clients/whisper/__init__.py
Normal file
0
lib/clients/whisper/__init__.py
Normal file
107
lib/clients/whisper/convert.py
Normal file
107
lib/clients/whisper/convert.py
Normal file
@@ -0,0 +1,107 @@
|
||||
import os
|
||||
from pydub import AudioSegment
|
||||
import whisper
|
||||
import moviepy.editor as mp
|
||||
import nltk
|
||||
from nltk.tokenize import sent_tokenize, word_tokenize
|
||||
|
||||
# Download necessary NLTK data
|
||||
# Runs as a module-level side effect on import. Presumably this is the data
# that sent_tokenize / word_tokenize (imported above) rely on.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of
# 'punkt' - confirm against the installed NLTK version.
nltk.download('punkt', quiet=True)
|
||||
|
||||
class Convertor:
    """Convert media files to WAV audio and split/write text in sized parts.

    ``process`` dispatches on the file extension: video files have their
    audio track extracted to a ``.wav`` file next to the source, audio files
    are converted to ``.wav`` when they are not already in that format. In
    both cases the path of the resulting WAV file is returned.

    The class also offers text helpers (``split_text``,
    ``find_natural_pause``, ``write_to_file``) that work on plain strings.
    """

    # Extensions (lower-case, with leading dot) recognised by ``process``.
    VIDEO_EXTENSIONS = ('.mp4', '.avi', '.mov')
    AUDIO_EXTENSIONS = ('.mp3', '.wav', '.ogg')

    # Tokens treated as sentence-ending punctuation by ``find_natural_pause``.
    _SENTENCE_ENDINGS = frozenset('.!?')

    def __init__(self, max_chars_per_part=4000, context: str = "main"):
        """Store the configuration.

        Args:
            max_chars_per_part: Character budget used by ``split_text``.
            context: Free-form label stored on the instance; it is not used
                by any method of this class.
        """
        self.max_chars_per_part = max_chars_per_part
        self.context = context

    @classmethod
    def new(cls, max_chars_per_part=4000, context: str = "main"):
        """Alternate constructor; forwards both settings to ``__init__``."""
        return cls(max_chars_per_part, context)

    @staticmethod
    def _wav_path_for(path: str) -> str:
        """Return *path* with its extension replaced by ``.wav``.

        ``os.path.splitext`` only looks at the final path component, so a dot
        inside a directory name is never mistaken for the extension.
        """
        return os.path.splitext(path)[0] + '.wav'

    @staticmethod
    def _append_part(parts, text) -> None:
        """Append the stripped *text* to *parts* unless it is empty."""
        cleaned = text.strip()
        if cleaned:
            parts.append(cleaned)

    def process(self, path: str) -> str:
        """Convert the media file at *path* to WAV and return the WAV path.

        Raises:
            ValueError: If the extension is neither a supported video nor a
                supported audio extension (case-insensitive).
        """
        lowered = path.lower()
        if lowered.endswith(self.VIDEO_EXTENSIONS):
            return self.process_video(path)
        if lowered.endswith(self.AUDIO_EXTENSIONS):
            return self.process_audio(path)
        raise ValueError("Unsupported file format")

    def process_video(self, video_path: str) -> str:
        """Extract the audio track of a video to a sibling ``.wav`` file.

        Returns:
            The path of the written WAV file.

        Raises:
            ValueError: If the clip exposes no audio track.
        """
        audio_path = self._wav_path_for(video_path)
        video = mp.VideoFileClip(video_path)
        try:
            if video.audio is None:
                raise ValueError(f"No audio track found in video: {video_path}")
            video.audio.write_audiofile(audio_path)
        finally:
            # Release the clip even when extraction fails.
            video.close()
        return audio_path

    def process_audio(self, audio_path: str) -> str:
        """Return a WAV version of *audio_path*, converting when necessary.

        A path that already ends in ``.wav`` (case-insensitive) is returned
        unchanged; any other file is exported to a sibling ``.wav`` file.
        """
        if audio_path.lower().endswith('.wav'):
            return audio_path
        wav_path = self._wav_path_for(audio_path)
        audio = AudioSegment.from_file(audio_path)
        audio.export(wav_path, format='wav')
        return wav_path

    def split_text(self, text) -> list:
        """Split *text* into parts of roughly ``max_chars_per_part`` characters.

        Paragraphs are separated on blank lines and each paragraph is split
        into sentences; sentences are packed greedily into the current part.
        A single sentence longer than the budget is not broken up and ends up
        in a part of its own. Parts are stripped of surrounding whitespace and
        empty parts are never emitted.

        Returns:
            A list of non-empty strings (empty for blank input).
        """
        parts = []
        current_part = ""

        for paragraph in text.split('\n\n'):
            # A blank paragraph has no sentences; skip the tokenizer call.
            sentences = sent_tokenize(paragraph) if paragraph.strip() else []
            for sentence in sentences:
                if len(current_part) + len(sentence) < self.max_chars_per_part:
                    current_part += sentence + ' '
                else:
                    self._append_part(parts, current_part)
                    current_part = sentence + ' '

            # Add a paragraph break if it doesn't exceed the limit.
            if len(current_part) + 2 < self.max_chars_per_part:
                current_part += '\n\n'
            else:
                self._append_part(parts, current_part)
                current_part = '\n\n'

        self._append_part(parts, current_part)
        return parts

    def find_natural_pause(self, text) -> tuple:
        """Split *text* in two at the first sentence end at or after the middle.

        The text is tokenized into words; the first ``.``, ``!`` or ``?``
        token found from the middle onwards ends the first half. Without such
        a token the split happens at the middle token. Both halves are rebuilt
        by joining tokens with single spaces, so the original spacing is not
        preserved.

        Returns:
            A ``(first_half, second_half)`` tuple of strings.
        """
        if not text.strip():
            return '', ''

        words = word_tokenize(text)
        mid_point = len(words) // 2

        # Look for sentence-ending punctuation from the middle onwards.
        for i, token in enumerate(words[mid_point:], start=mid_point):
            if token in self._SENTENCE_ENDINGS:
                return ' '.join(words[:i + 1]), ' '.join(words[i + 1:])

        # No punctuation found: split at the middle token.
        return ' '.join(words[:mid_point]), ' '.join(words[mid_point:])

    def write_to_file(self, parts, output_path) -> None:
        """Write *parts* to *output_path* as numbered, separated sections.

        Each part is written as ``Part N:``, a blank line, the text and
        another blank line; a line of 50 dashes separates consecutive parts.
        The file is overwritten and encoded as UTF-8.
        """
        separator = "-" * 50 + "\n\n"
        last = len(parts)
        with open(output_path, 'w', encoding='utf-8') as f:
            for i, part in enumerate(parts, 1):
                f.write(f"Part {i}:\n\n")
                f.write(part)
                f.write("\n\n")
                if i < last:
                    f.write(separator)
|
||||
|
||||
|
||||
# Usage example:
|
||||
if __name__ == "__main__":
    # Convertor.process only converts media to WAV and returns the WAV path;
    # it does not transcribe, so there are no text parts to write or print.
    processor = Convertor.new()
    item = "/Users/despiegk1/Documents/Zoom/2024-07-16 16.42.50 Kristof De Spiegeleer's Personal Meeting Room/video1720369800.mp4"
    wav_path = processor.process(item)

    print(f"Audio written to: {wav_path}")
|
118
lib/clients/whisper/whisper.py
Normal file
118
lib/clients/whisper/whisper.py
Normal file
@@ -0,0 +1,118 @@
|
||||
import os
|
||||
from pydub import AudioSegment
|
||||
import whisper
|
||||
import moviepy.editor as mp
|
||||
import nltk
|
||||
from nltk.tokenize import sent_tokenize, word_tokenize
|
||||
|
||||
# Download necessary NLTK data
|
||||
# Runs as a module-level side effect on import. Presumably this is the data
# that sent_tokenize / word_tokenize (imported above) rely on.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of
# 'punkt' - confirm against the installed NLTK version.
nltk.download('punkt', quiet=True)
|
||||
|
||||
class MediaProcessor:
    """Transcribe audio/video files with Whisper and split the text in parts.

    ``process`` dispatches on the file extension: video files have their
    audio track extracted to a ``.wav`` file next to the source, audio files
    are converted to ``.wav`` when needed, and the resulting WAV file is
    transcribed. The transcription is returned as a list of text parts of
    roughly ``max_chars_per_part`` characters each.
    """

    # Extensions (lower-case, with leading dot) recognised by ``process``.
    VIDEO_EXTENSIONS = ('.mp4', '.avi', '.mov')
    AUDIO_EXTENSIONS = ('.mp3', '.wav', '.ogg')

    # Tokens treated as sentence-ending punctuation by ``find_natural_pause``.
    _SENTENCE_ENDINGS = frozenset('.!?')

    def __init__(self, max_chars_per_part=4000, model_name: str = "base.en"):
        """Load the Whisper model and store the part size.

        Args:
            max_chars_per_part: Character budget used by ``split_text``.
            model_name: Name passed to ``whisper.load_model``; defaults to
                ``"base.en"`` (``"medium.en"`` is another option).
        """
        self.model = whisper.load_model(model_name)
        self.max_chars_per_part = max_chars_per_part

    @classmethod
    def new(cls, max_chars_per_part=4000, model_name: str = "base.en"):
        """Alternate constructor; forwards both settings to ``__init__``."""
        return cls(max_chars_per_part, model_name)

    @staticmethod
    def _wav_path_for(path: str) -> str:
        """Return *path* with its extension replaced by ``.wav``.

        ``os.path.splitext`` only looks at the final path component, so a dot
        inside a directory name is never mistaken for the extension.
        """
        return os.path.splitext(path)[0] + '.wav'

    @staticmethod
    def _append_part(parts, text) -> None:
        """Append the stripped *text* to *parts* unless it is empty."""
        cleaned = text.strip()
        if cleaned:
            parts.append(cleaned)

    def process(self, path: str) -> list:
        """Transcribe the media file at *path* and return the text parts.

        Raises:
            ValueError: If the extension is neither a supported video nor a
                supported audio extension (case-insensitive).
        """
        lowered = path.lower()
        if lowered.endswith(self.VIDEO_EXTENSIONS):
            return self.process_video(path)
        if lowered.endswith(self.AUDIO_EXTENSIONS):
            return self.process_audio(path)
        raise ValueError("Unsupported file format")

    def process_video(self, video_path: str) -> list:
        """Extract the audio of a video to a sibling ``.wav`` and transcribe it.

        Raises:
            ValueError: If the clip exposes no audio track.
        """
        audio_path = self._wav_path_for(video_path)
        video = mp.VideoFileClip(video_path)
        try:
            if video.audio is None:
                raise ValueError(f"No audio track found in video: {video_path}")
            video.audio.write_audiofile(audio_path)
        finally:
            # Release the clip even when extraction fails.
            video.close()

        # Now process the extracted audio.
        return self.process_audio(audio_path)

    def process_audio(self, audio_path: str) -> list:
        """Transcribe an audio file and return the transcription in parts.

        A file that does not already end in ``.wav`` (case-insensitive) is
        first exported to a sibling ``.wav`` file.
        """
        if audio_path.lower().endswith('.wav'):
            wav_path = audio_path
        else:
            wav_path = self._wav_path_for(audio_path)
            audio = AudioSegment.from_file(audio_path)
            audio.export(wav_path, format='wav')

        # Transcribe audio using Whisper, then split the text into parts.
        result = self.model.transcribe(wav_path)
        return self.split_text(result["text"])

    def split_text(self, text) -> list:
        """Split *text* into parts of roughly ``max_chars_per_part`` characters.

        Paragraphs are separated on blank lines and each paragraph is split
        into sentences; sentences are packed greedily into the current part.
        A single sentence longer than the budget is not broken up and ends up
        in a part of its own. Parts are stripped of surrounding whitespace and
        empty parts are never emitted.

        Returns:
            A list of non-empty strings (empty for blank input).
        """
        parts = []
        current_part = ""

        for paragraph in text.split('\n\n'):
            # A blank paragraph has no sentences; skip the tokenizer call.
            sentences = sent_tokenize(paragraph) if paragraph.strip() else []
            for sentence in sentences:
                if len(current_part) + len(sentence) < self.max_chars_per_part:
                    current_part += sentence + ' '
                else:
                    self._append_part(parts, current_part)
                    current_part = sentence + ' '

            # Add a paragraph break if it doesn't exceed the limit.
            if len(current_part) + 2 < self.max_chars_per_part:
                current_part += '\n\n'
            else:
                self._append_part(parts, current_part)
                current_part = '\n\n'

        self._append_part(parts, current_part)
        return parts

    def find_natural_pause(self, text) -> tuple:
        """Split *text* in two at the first sentence end at or after the middle.

        The text is tokenized into words; the first ``.``, ``!`` or ``?``
        token found from the middle onwards ends the first half. Without such
        a token the split happens at the middle token. Both halves are rebuilt
        by joining tokens with single spaces, so the original spacing is not
        preserved.

        Returns:
            A ``(first_half, second_half)`` tuple of strings.
        """
        if not text.strip():
            return '', ''

        words = word_tokenize(text)
        mid_point = len(words) // 2

        # Look for sentence-ending punctuation from the middle onwards.
        for i, token in enumerate(words[mid_point:], start=mid_point):
            if token in self._SENTENCE_ENDINGS:
                return ' '.join(words[:i + 1]), ' '.join(words[i + 1:])

        # No punctuation found: split at the middle token.
        return ' '.join(words[:mid_point]), ' '.join(words[mid_point:])

    def write_to_file(self, parts, output_path) -> None:
        """Write *parts* to *output_path* as numbered, separated sections.

        Each part is written as ``Part N:``, a blank line, the text and
        another blank line; a line of 50 dashes separates consecutive parts.
        The file is overwritten and encoded as UTF-8.
        """
        separator = "-" * 50 + "\n\n"
        last = len(parts)
        with open(output_path, 'w', encoding='utf-8') as f:
            for i, part in enumerate(parts, 1):
                f.write(f"Part {i}:\n\n")
                f.write(part)
                f.write("\n\n")
                if i < last:
                    f.write(separator)
|
||||
|
||||
|
||||
# Usage example:
|
||||
if __name__ == "__main__":
    # Demo run: transcribe one recording, save the parts to a Markdown file
    # and echo them to stdout.
    source_media = "/Users/despiegk1/Documents/Zoom/2024-07-16 16.42.50 Kristof De Spiegeleer's Personal Meeting Room/video1720369800.mp4"
    destination = "/Users/despiegk1/Documents/transcription3.md"

    transcriber = MediaProcessor.new(max_chars_per_part=10000)
    chunks = transcriber.process(source_media)
    transcriber.write_to_file(chunks, destination)

    divider = "-" * 50
    print(f"Transcription split into {len(chunks)} parts:")
    for number, chunk in enumerate(chunks, start=1):
        # One print call per part: header, text and divider on separate lines.
        print(f"Part {number}:", chunk, divider, sep="\n")
|
Reference in New Issue
Block a user