-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_mining.py
100 lines (77 loc) · 3.59 KB
/
data_mining.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
from pathlib import Path
import json
from anthropic_processor import AnthropicProcessor
# System prompt sent with every request. Only OCR text is supplied to the
# model at the moment, so the prompt asks for restaurants found in that text.
_RESTAURANT_SYSTEM_PROMPT = """
You will be given the OCR text extracted from video frames
Provide the list of all the restaurants mentioned in the transcription.
For the name of the restaurant, use the name from the OCR Text.
Include description of the restaurant.
You must give answer in json format with the following format:
{
"restaurants": [
{
"name": "restaurant name",
"description": "restaurant description"
}
]
}
"""

# Earlier prompt draft, kept for reference. It also fed the audio
# transcription and the description to the model:
# , the audio transcription, and the Description.
# You are an audio transcriber. You have already transcribed the audio, but unsure of the quality of the transcription.
# You will be given the OCR text extracted from video frames, the audio transcription, and the Description. Correct any errors in the transcription.
# When you make a revision, replace text with the OCR Text extracted from video frames.
# 1. OCR Text
# 2. Transcription
# 3. Description
# First give me the revised transcription with correct wordings.
# then Provide all the restaurants mentioned in the transcription. Include description of the restaurant.


def _preview_entries(combined_data):
    """Print up to the first two (timestamp, data) pairs as a sanity check."""
    from itertools import islice

    # zip() stops at the shorter input, so a file with a single entry prints
    # only the "First" pair instead of raising IndexError.
    labels = ("First", "Second")
    for position, (label, (timestamp, data)) in enumerate(
        zip(labels, islice(combined_data.items(), len(labels)))
    ):
        separator = "\n" if position else ""
        print(f"{separator}{label} timestamp: {timestamp}")
        print(f"{label} item data: {data}")


def _build_content_text(content):
    """Return the text payload for one entry, or "" when nothing is usable."""
    content_parts = []
    if content.get("ocr"):
        content_parts.append("OCR Text:\n" + str(content["ocr"]))
    # NOTE: transcription and description inputs are currently disabled.
    # if content.get("transcription"):
    #     content_parts.append("Transcription:\n" + str(content["transcription"]))
    # if content.get("description"):
    #     content_parts.append("Description:\n" + str(content["description"]))
    return "\n\n".join(content_parts)


def analyze_video_content():
    """Analyze every entry of ``combined_data.json`` for restaurant mentions.

    Reads ``combined_data.json`` from the current working directory, sends the
    OCR text of each entry to ``AnthropicProcessor.process_text`` together
    with the restaurant-extraction system prompt, and writes the per-timestamp
    results to ``content_analysis.json``.

    Entries without OCR text are skipped. If processing an entry raises, the
    error is printed and stored as ``"Error: <message>"`` under that timestamp
    so the remaining entries are still processed.

    Returns:
        None. When the input file is missing or contains no entries, a
        message is printed and nothing is written.
    """
    combined_data_path = Path("combined_data.json")
    if not combined_data_path.exists():
        print("combined_data.json not found")
        return

    with combined_data_path.open('r', encoding='utf-8') as f:
        combined_data = json.load(f)

    # Nothing to analyze: bail out before creating the processor.
    if not combined_data:
        print("combined_data.json contains no entries")
        return

    _preview_entries(combined_data)

    # Created only after the input checks, so a missing or empty input file
    # never instantiates the processor.
    processor = AnthropicProcessor()
    analysis_results = {}

    for timestamp, content in combined_data.items():
        print(f"\nAnalyzing content from: {timestamp}")

        combined_text = _build_content_text(content)
        if not combined_text:
            print(f"No content available for {timestamp}")
            continue
        print(combined_text)

        try:
            analysis = processor.process_text(
                combined_text, _RESTAURANT_SYSTEM_PROMPT
            )
        except Exception as e:
            # Best effort: record the failure and keep going with the rest.
            print(f"Error analyzing content from {timestamp}: {e}")
            analysis_results[timestamp] = f"Error: {e}"
        else:
            # Stored as returned; presumably JSON-serializable, since it is
            # dumped below (verify against AnthropicProcessor).
            analysis_results[timestamp] = analysis
            print(f"Successfully analyzed content from {timestamp}")

    # Save analysis results
    output_path = Path("content_analysis.json")
    with output_path.open('w', encoding='utf-8') as f:
        json.dump(analysis_results, f, indent=4, ensure_ascii=False)
    print(f"\nAnalysis complete. Results saved to {output_path}")
# Run the analysis when this file is executed. NOTE(review): there is no
# `if __name__ == "__main__":` guard, so importing this module also runs it.
analyze_video_content()