vertexcover-io · synacktraa · Feb 23, 2025 · Feb 23, 2025
diff --git a/eval/interactive/README.md b/eval/interactive/README.md
@@ -5,13 +5,11 @@ go run evaluator.go -input urls.txt
 ```
 > This will write to a JsON lines file `results_{<start-time>}.jsonl`
 
-## Render the final evaluation
+## Convert the JSON Lines output to CSV and Markdown
 
 ```sh
-python render.py results_{<start-time>}.jsonl
+python convert.py results_{<start-time>}.jsonl
 ```
-> This will write to a Markdown file `rendered_results_{<start-time>}.md`
-
 
 ---
 
@@ -20,6 +18,5 @@ python render.py results_{<start-time>}.jsonl
 Each color in the annotated screenshots represents a different selection method:
 
 - 🔴 Red: Manual user selection
-- 🔵 Blue: Original locatr (without reranker)
-- 🟡 Yellow: Original locatr (with reranker)
+- 🔵 Blue: Original locatr (with reranker)
 - 🟢 Green: Anthropic grounding locatr
diff --git a/eval/interactive/convert.py b/eval/interactive/convert.py
@@ -0,0 +1,163 @@
+import json
+import sys
+import csv
+from pathlib import Path
+from collections import defaultdict
+
+
+def format_points(points):
+    """Serialize points as ``(X,Y)`` pairs joined by semicolons.
+
+    Each point is a mapping with numeric ``"X"`` and ``"Y"`` entries; both
+    values are rendered with two decimal places. A falsy input (empty list
+    or ``None``) yields an empty string.
+    """
+    rendered = []
+    for point in points or ():
+        rendered.append("({:.2f},{:.2f})".format(point["X"], point["Y"]))
+    return ";".join(rendered)
+
+
+def jsonl_to_csv(input_file, output_file):
+    """Convert a JSON Lines results file into a flat CSV file.
+
+    Each non-blank input line is parsed as one JSON record and written as
+    one CSV row. Per-model metrics are flattened into columns named
+    ``<Prefix>_<Metric>``; the columns of a model that has no output in a
+    record are left empty.
+
+    Args:
+        input_file: Path of the JSONL file to read.
+        output_file: Path of the CSV file to write (overwritten if present).
+
+    Raises:
+        json.JSONDecodeError: If a non-blank line is not valid JSON.
+        KeyError: If a record lacks a required field.
+    """
+    # Maps each model's key under "Outputs" to its CSV column prefix.
+    model_prefixes = {
+        "originalLocatr": "OriginalLocatr",
+        "anthropicGroundingLocatr": "AnthropicGroundingLocatr",
+    }
+    metrics = ("InputTokens", "OutputTokens", "TotalTokens", "CostInDollars")
+
+    # Column order: record fields, then one group per model, then image path.
+    headers = [
+        "Url",
+        "ScrollCoordinates_X",
+        "ScrollCoordinates_Y",
+        "ElementDescription",
+        "ElementCoordinates_X",
+        "ElementCoordinates_Y",
+    ]
+    for prefix in model_prefixes.values():
+        headers.extend(f"{prefix}_{metric}" for metric in metrics)
+        headers.append(f"{prefix}_GeneratedPoints")
+    headers.append("ImagePath")
+
+    # The input is opened first so that a missing input file does not leave
+    # a header-only CSV behind. The encoding is explicit so the result does
+    # not depend on the platform's default encoding.
+    with open(input_file, "r", encoding="utf-8") as jsonl, open(
+        output_file, "w", newline="", encoding="utf-8"
+    ) as csvfile:
+        # restval fills every column that a row does not set.
+        writer = csv.DictWriter(csvfile, fieldnames=headers, restval="")
+        writer.writeheader()
+
+        for line in jsonl:
+            line = line.strip()
+            if not line:
+                # Tolerate blank lines (e.g. a trailing empty line at EOF).
+                continue
+            entry = json.loads(line)
+
+            row = {
+                "Url": entry["Url"],
+                "ScrollCoordinates_X": entry["ScrollCoordinates"]["X"],
+                "ScrollCoordinates_Y": entry["ScrollCoordinates"]["Y"],
+                "ElementDescription": entry["ElementDescription"],
+                "ElementCoordinates_X": entry["ElementCoordinates"]["X"],
+                "ElementCoordinates_Y": entry["ElementCoordinates"]["Y"],
+                "ImagePath": entry.get("ImagePath", ""),
+            }
+
+            # "Outputs" may be absent or null; treat both as "no models".
+            outputs = entry.get("Outputs") or {}
+            for model, prefix in model_prefixes.items():
+                output = outputs.get(model)
+                if not output:
+                    # No data for this model: its columns stay empty.
+                    continue
+                for metric in metrics:
+                    row[f"{prefix}_{metric}"] = output[metric]
+                row[f"{prefix}_GeneratedPoints"] = format_points(
+                    output.get("GeneratedPoints", [])
+                )
+
+            writer.writerow(row)
+
+
+def jsonl_to_markdown(input_file, output_file):
+    """Render a JSON Lines results file as a Markdown report grouped by URL.
+
+    Records are grouped by their ``"Url"`` field (URLs appear in first-seen
+    order). Each URL becomes a top-level heading, each record a numbered
+    entry listing the element description, coordinates, per-model outputs
+    and, when present, the annotated screenshot.
+
+    Args:
+        input_file: Path of the JSONL file to read.
+        output_file: Path of the Markdown file to write (overwritten if
+            present).
+
+    Raises:
+        json.JSONDecodeError: If a non-blank line is not valid JSON.
+        KeyError: If a record lacks a required field.
+    """
+    # First pass: group entries by URL.
+    url_entries = defaultdict(list)
+    with open(input_file, "r", encoding="utf-8") as f_in:
+        for line in f_in:
+            line = line.strip()
+            if not line:
+                # Tolerate blank lines (e.g. a trailing empty line at EOF).
+                continue
+            entry = json.loads(line)
+            url_entries[entry["Url"]].append(entry)
+
+    with open(output_file, "w", encoding="utf-8") as f_out:
+        for url, entries in url_entries.items():
+            # URL as main heading
+            f_out.write(f"# URL: {url}\n\n")
+
+            for i, entry in enumerate(entries, 1):
+                element = entry["ElementCoordinates"]
+                scroll = entry["ScrollCoordinates"]
+
+                f_out.write(f"## Entry {i}\n\n")
+                f_out.write(
+                    f"**Description**: {entry['ElementDescription']}\n\n"
+                )
+                f_out.write(
+                    f"**Coordinates**: X={element['X']}, Y={element['Y']}\n\n"
+                )
+                f_out.write(
+                    f"**Scroll To**: X={scroll['X']}, Y={scroll['Y']}\n\n"
+                )
+
+                # "Outputs" may be absent or null; treat both as "no models".
+                for model, details in (entry.get("Outputs") or {}).items():
+                    f_out.write(f"#### `{model}`\n")
+
+                    if not details:
+                        # A model can have a null/empty output; the CSV
+                        # converter tolerates that, so do the same here.
+                        f_out.write("- No output recorded\n\n")
+                        continue
+
+                    # "GeneratedPoints" may be missing or null.
+                    f_out.write("- Generated Points:\n")
+                    points = details.get("GeneratedPoints") or []
+                    for j, point in enumerate(points, 1):
+                        f_out.write(
+                            f"  - Point {j}: X={point['X']}, Y={point['Y']}\n"
+                        )
+                    f_out.write("\n")
+
+                    f_out.write(f"- Input Tokens: {details['InputTokens']}\n")
+                    f_out.write(f"- Output Tokens: {details['OutputTokens']}\n")
+                    f_out.write(f"- Total Tokens: {details['TotalTokens']}\n")
+                    f_out.write(
+                        f"- Cost in Dollars: {details['CostInDollars']}\n\n"
+                    )
+
+                # Only link a screenshot when a non-empty path is recorded.
+                image_path = entry.get("ImagePath")
+                if image_path:
+                    f_out.write("### Annotated Screenshot\n")
+                    f_out.write(f"![Screenshot]({image_path})\n\n")
+
+            # Horizontal rule between URLs
+            f_out.write("---\n\n")
+
+
+def main(argv=None):
+    """Convert one JSONL results file to Markdown and CSV next to it.
+
+    Args:
+        argv: Command-line arguments without the program name; defaults to
+            ``sys.argv[1:]``. Exactly one argument (the input path) is
+            expected.
+
+    Returns:
+        Process exit status: 0 on success, 1 if the input file does not
+        exist, 2 on incorrect usage.
+    """
+    args = sys.argv[1:] if argv is None else list(argv)
+    if len(args) != 1:
+        print(
+            f"Usage: python {Path(sys.argv[0]).name} <results.jsonl>",
+            file=sys.stderr,
+        )
+        return 2
+
+    # The suffix is normalized to ".jsonl", so the path may be given with
+    # or without its extension.
+    input_file = Path(args[0]).with_suffix(".jsonl")
+    if not input_file.is_file():
+        print(f"Input file not found: {input_file}", file=sys.stderr)
+        return 1
+
+    # Create both markdown and CSV outputs
+    markdown_output = input_file.with_suffix(".md")
+    csv_output = input_file.with_suffix(".csv")
+
+    jsonl_to_markdown(input_file, markdown_output)
+    jsonl_to_csv(input_file, csv_output)
+    print("Conversion complete. Outputs written to:")
+    print(f"- Markdown: {markdown_output}")
+    print(f"- CSV: {csv_output}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/eval/interactive/evaluator.go b/eval/interactive/evaluator.go
@@ -14,6 +14,7 @@ import (
 	"image/draw"
 	"image/jpeg"
 	"log"
+	"log/slog"
 	"os"
 	"os/signal"
 	"regexp"
@@ -68,6 +69,19 @@ type Output struct {
 	InputTokens     int64
 	OutputTokens    int64
 	TotalTokens     int64
+	CostInDollars   float64
+}
+
+func calculateCost(inputTokens, outputTokens int, costPer1MInputTokens, costPer1MOutputTokens float64) float64 {
+	fmt.Printf("cost per 1 mil input tokens: %.5f\n", costPer1MInputTokens)
+	fmt.Printf("cost per 1 mil output tokens: %.5f\n", costPer1MOutputTokens)
+
+	// Convert to float64 before division
+	inputCost := (float64(inputTokens) / 1000000.0) * costPer1MInputTokens
+	outputCost := (float64(outputTokens) / 1000000.0) * costPer1MOutputTokens
+
+	fmt.Printf("Input cost: %.5f, Output cost: %.5f\n\n", inputCost, outputCost)
+	return inputCost + outputCost
 }
 
 type Captured struct {
@@ -82,7 +96,9 @@ type LocatrInterface interface {
 }
 
 type OriginalLocatr struct {
-	instance *playwrightLocatr.PlaywrightLocator
+	instance              *playwrightLocatr.PlaywrightLocator
+	costPer1MInputTokens  float64
+	costPer1MOutputTokens float64
 }
 
 func (l OriginalLocatr) call(page *playwright.Page, query string) (*Output, error) {
@@ -98,13 +114,19 @@ func (l OriginalLocatr) call(page *playwright.Page, query string) (*Output, erro
 		InputTokens:     int64(lastResult.InputTokens),
 		OutputTokens:    int64(lastResult.OutputTokens),
 		TotalTokens:     int64(lastResult.TotalTokens),
+		CostInDollars: calculateCost(
+			lastResult.InputTokens,
+			lastResult.OutputTokens,
+			l.costPer1MInputTokens,
+			l.costPer1MOutputTokens,
+		),
 	}
 
 	if len(lastResult.AllLocatrs) == 0 {
 		return &output, nil
 	}
 
-	points := make([]Point, len(lastResult.AllLocatrs))
+	points := []Point{}
 	appended := false
 	viewportSize := (*page).ViewportSize()
 
@@ -116,6 +138,7 @@ func (l OriginalLocatr) call(page *playwright.Page, query string) (*Output, erro
 		X := bbox.X + bbox.Width/2
 		Y := bbox.Y + bbox.Height/2
 
+		fmt.Printf("X: %.2f, Y: %.2f\n", X, Y)
 		// Check if the element is within the viewport
 		if X < 0 || Y < 0 ||
 			bbox.X > float64(viewportSize.Width) || bbox.Y > float64(viewportSize.Height) {
@@ -136,7 +159,9 @@ func (l OriginalLocatr) call(page *playwright.Page, query string) (*Output, erro
 const ANTHROPIC_GROUNDING_INSTRUCTION string = `Given the screen resolution of 1280x800, identify the exact (X, Y) coordinates for the described area, element, or object on a browser GUI screen.`
 
 type AnthropicGroundingLocatr struct {
-	client *anthropic.Client
+	client                *anthropic.Client
+	costPer1MInputTokens  float64
+	costPer1MOutputTokens float64
 }
 
 func (l AnthropicGroundingLocatr) call(page *playwright.Page, query string) (*Output, error) {
@@ -152,7 +177,6 @@ func (l AnthropicGroundingLocatr) call(page *playwright.Page, query string) (*Ou
 	if err != nil {
 		return nil, err
 	}
-	fmt.Printf("Tool input schema: %v\n", toolInputSchema)
 	anthropicToolInputSchema := anthropic.BetaToolInputSchemaParam{
 		Type:       anthropic.F(anthropic.BetaToolInputSchemaTypeObject),
 		Properties: anthropic.F(toolInputSchema["properties"]),
@@ -161,7 +185,7 @@ func (l AnthropicGroundingLocatr) call(page *playwright.Page, query string) (*Ou
 	response, err := l.client.Beta.Messages.New(
 		context.TODO(),
 		anthropic.BetaMessageNewParams{
-			Model:     anthropic.F(anthropic.ModelClaude3_5SonnetLatest),
+			Model:     anthropic.F(anthropic.ModelClaude3_5Sonnet20241022),
 			MaxTokens: anthropic.F(int64(1024)),
 			Messages: anthropic.F([]anthropic.BetaMessageParam{
 				{
@@ -215,6 +239,12 @@ func (l AnthropicGroundingLocatr) call(page *playwright.Page, query string) (*Ou
 		InputTokens:     response.Usage.InputTokens,
 		OutputTokens:    response.Usage.OutputTokens,
 		TotalTokens:     response.Usage.InputTokens + response.Usage.OutputTokens,
+		CostInDollars: calculateCost(
+			int(response.Usage.InputTokens),
+			int(response.Usage.OutputTokens),
+			l.costPer1MInputTokens,
+			l.costPer1MOutputTokens,
+		),
 	}
 	content := response.Content[0]
 
@@ -259,7 +289,7 @@ func (l AnthropicGroundingLocatr) call(page *playwright.Page, query string) (*Ou
 
 }
 
-func getOriginalLocatrInstance(page *playwright.Page, rerank bool) *playwrightLocatr.PlaywrightLocator {
+func getOriginalLocatrInstance(page *playwright.Page) *playwrightLocatr.PlaywrightLocator {
 	llmClient, err := llm.NewLlmClient(
 		llm.OpenAI, "gpt-4o", os.Getenv("OPENAI_API_KEY"),
 	)
@@ -268,10 +298,8 @@ func getOriginalLocatrInstance(page *playwright.Page, rerank bool) *playwrightLo
 		return nil
 	}
 	locatrOptions := locatr.BaseLocatrOptions{LlmClient: llmClient}
-	if rerank {
-		reRankClient := reranker.NewCohereClient(os.Getenv("COHERE_API_KEY"))
-		locatrOptions.ReRankClient = reRankClient
-	}
+	reRankClient := reranker.NewCohereClient(os.Getenv("COHERE_API_KEY"))
+	locatrOptions.ReRankClient = reRankClient
 	return playwrightLocatr.NewPlaywrightLocatr(*page, locatrOptions)
 }
 
@@ -415,16 +443,20 @@ func processURLs(urls []string) error {
 			return fmt.Errorf("failed to unmarshal captured elements: %v", err)
 		}
 
-		originalLocatrWithoutReranking := OriginalLocatr{instance: getOriginalLocatrInstance(&page, false)} // without Reranker
-		originalLocatrWithReranking := OriginalLocatr{instance: getOriginalLocatrInstance(&page, true)}
+		originalLocatr := OriginalLocatr{
+			instance:              getOriginalLocatrInstance(&page),
+			costPer1MInputTokens:  2.5,
+			costPer1MOutputTokens: 10.0,
+		}
 		anthropicGroundingLocatr := AnthropicGroundingLocatr{
-			client: anthropic.NewClient(option.WithAPIKey(os.Getenv("ANTHROPIC_API_KEY"))),
+			client:                anthropic.NewClient(option.WithAPIKey(os.Getenv("ANTHROPIC_API_KEY"))),
+			costPer1MInputTokens:  3.0,
+			costPer1MOutputTokens: 15.0,
 		}
 
-		redColor := color.RGBA{R: 255, G: 0, B: 0, A: 255}      // Selected manually
-		blueColor := color.RGBA{R: 0, G: 0, B: 255, A: 255}     // Generated by original Locator (without reranking)
-		yellowColor := color.RGBA{R: 255, G: 255, B: 0, A: 255} // Generated by original Locator (with reranking)
-		greenColor := color.RGBA{R: 0, G: 255, B: 0, A: 255}    // Generated by anthropic grounding locatr
+		redColor := color.RGBA{R: 255, G: 0, B: 0, A: 255}   // Selected manually
+		blueColor := color.RGBA{R: 0, G: 0, B: 255, A: 255}  // Generated by original Locator (with reranking)
+		greenColor := color.RGBA{R: 0, G: 255, B: 0, A: 255} // Generated by anthropic grounding locatr
 
 		// Process Captured Elements
 		for _, elem := range captured {
@@ -438,7 +470,8 @@ func processURLs(urls []string) error {
 
 			// Wait until scroll reaches the desired point
 			if _, err := page.WaitForFunction(
-				`([X, Y]) => window.scrollX == X && window.scrollY == Y`, []float64{scrollCoords.X, scrollCoords.Y},
+				`([X, Y]) =>Math.round(window.scrollX) == Math.round(X) && Math.round(window.scrollY) == Math.round(Y)`,
+				[]float64{scrollCoords.X, scrollCoords.Y},
 			); err != nil {
 				return fmt.Errorf("scroll verification failed: %v", err)
 			}
@@ -459,7 +492,7 @@ func processURLs(urls []string) error {
 				log.Fatalf("Failed to decode image: %v", err)
 			}
 
-			drawPoints(img, &[]Point{elem.ElementCoordinates}, redColor, 14)
+			drawPoints(img, &[]Point{elem.ElementCoordinates}, redColor, 15)
 			outputs := make(map[string]*Output, 3)
 
 			call := func(locatr LocatrInterface, name string, color color.Color, radius int) {
@@ -479,9 +512,8 @@ func processURLs(urls []string) error {
 				}
 			}
 
-			call(originalLocatrWithoutReranking, "originalLocatrWithoutReranking", blueColor, 12)
-			call(originalLocatrWithReranking, "originalLocatrWithReranking", yellowColor, 10)
-			call(anthropicGroundingLocatr, "anthropicGroundingLocatr", greenColor, 8)
+			call(originalLocatr, "originalLocatr", blueColor, 12)
+			call(anthropicGroundingLocatr, "anthropicGroundingLocatr", greenColor, 9)
 
 			// Convert back to bytes
 			finalBytes, err := imageToBytes(img)
@@ -549,6 +581,7 @@ func loadURLs(inputPath string) ([]string, error) {
 
 func main() {
 	// Load environment variables
+	logger.Level.Set(slog.LevelDebug)
 	if err := godotenv.Load(); err != nil {
 		log.Println("Error loading .env file")
 	}

diff --git a/eval/interactive/images/02db0c61-d893-48c4-a30b-39a71daa31dc.jpeg b/eval/interactive/images/02db0c61-d893-48c4-a30b-39a71daa31dc.jpeg
diff --git a/eval/interactive/images/02f781e6-d649-4ad5-af9b-7d18c1532e22.jpeg b/eval/interactive/images/02f781e6-d649-4ad5-af9b-7d18c1532e22.jpeg
diff --git a/eval/interactive/images/0e0f3a85-f8a6-4dd0-8004-7c6f259e1680.jpeg b/eval/interactive/images/0e0f3a85-f8a6-4dd0-8004-7c6f259e1680.jpeg
diff --git a/eval/interactive/images/3440ea2b-4db9-42b6-b22d-912037b064d4.jpeg b/eval/interactive/images/3440ea2b-4db9-42b6-b22d-912037b064d4.jpeg
diff --git a/eval/interactive/images/4073d76d-0b8d-43ae-a892-77fd982749b5.jpeg b/eval/interactive/images/4073d76d-0b8d-43ae-a892-77fd982749b5.jpeg
diff --git a/eval/interactive/images/47af2bc0-e8b8-4591-a68b-8fc2efef101d.jpeg b/eval/interactive/images/47af2bc0-e8b8-4591-a68b-8fc2efef101d.jpeg
diff --git a/eval/interactive/images/5e6fd6ae-115d-4187-83f3-0d725856f892.jpeg b/eval/interactive/images/5e6fd6ae-115d-4187-83f3-0d725856f892.jpeg
diff --git a/eval/interactive/images/6c5ca95c-c185-4181-bb24-e729b278bd35.jpeg b/eval/interactive/images/6c5ca95c-c185-4181-bb24-e729b278bd35.jpeg
diff --git a/eval/interactive/images/6f9e0683-130c-4e5a-ae05-2cd4a1792792.jpeg b/eval/interactive/images/6f9e0683-130c-4e5a-ae05-2cd4a1792792.jpeg
diff --git a/eval/interactive/images/996fb3ec-9a6b-45d5-a418-dca7523e9a1e.jpeg b/eval/interactive/images/996fb3ec-9a6b-45d5-a418-dca7523e9a1e.jpeg
diff --git a/eval/interactive/images/a0fbd67e-d20d-4d8b-840f-e2133a549983.jpeg b/eval/interactive/images/a0fbd67e-d20d-4d8b-840f-e2133a549983.jpeg
diff --git a/eval/interactive/images/a2b7fdc3-b895-41e7-88e6-8b545a53f506.jpeg b/eval/interactive/images/a2b7fdc3-b895-41e7-88e6-8b545a53f506.jpeg
diff --git a/eval/interactive/images/b51d2716-d2cd-4d77-bd32-2818e9b587db.jpeg b/eval/interactive/images/b51d2716-d2cd-4d77-bd32-2818e9b587db.jpeg
diff --git a/eval/interactive/images/cc04b113-bb91-49fe-885f-823783f45d93.jpeg b/eval/interactive/images/cc04b113-bb91-49fe-885f-823783f45d93.jpeg
diff --git a/eval/interactive/images/cca9b9f4-7969-4a70-a9d7-902254bfb0b9.jpeg b/eval/interactive/images/cca9b9f4-7969-4a70-a9d7-902254bfb0b9.jpeg
diff --git a/eval/interactive/images/d4dc6683-ed9a-49bb-9b3c-b49928c374d9.jpeg b/eval/interactive/images/d4dc6683-ed9a-49bb-9b3c-b49928c374d9.jpeg
diff --git a/eval/interactive/images/dd2f43b7-a96e-42a6-8157-805042a204c6.jpeg b/eval/interactive/images/dd2f43b7-a96e-42a6-8157-805042a204c6.jpeg
diff --git a/eval/interactive/images/fa79f33f-dc1a-4f53-9648-6f7364f9c279.jpeg b/eval/interactive/images/fa79f33f-dc1a-4f53-9648-6f7364f9c279.jpeg
diff --git a/eval/interactive/images/ff4c91f5-d69f-465d-82a8-434670cee6a7.jpeg b/eval/interactive/images/ff4c91f5-d69f-465d-82a8-434670cee6a7.jpeg