Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed bugs in element ID locator and added cost calculation in interactive evaluator #37

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 3 additions & 6 deletions eval/interactive/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,11 @@ go run evaluator.go -input urls.txt
```
> This will write to a JSON Lines file `results_{<start-time>}.jsonl`

## Render the final evaluation
## Convert the JSON Lines output to CSV and Markdown

```sh
python render.py results_{<start-time>}.jsonl
python convert.py results_{<start-time>}.jsonl
```
> This will write to a Markdown file `rendered_results_{<start-time>}.md`


---

Expand All @@ -20,6 +18,5 @@ python render.py results_{<start-time>}.jsonl
Each color in the annotated screenshots represents a different selection method:

- 🔴 Red: Manual user selection
- 🔵 Blue: Original locatr (without reranker)
- 🟡 Yellow: Original locatr (with reranker)
- 🔵 Blue: Original locatr (with reranker)
- 🟢 Green: Anthropic grounding locatr
163 changes: 163 additions & 0 deletions eval/interactive/convert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
import json
import sys
import csv
from pathlib import Path
from collections import defaultdict


def format_points(points):
    """Render points as ``(X,Y)`` pairs joined by semicolons.

    Each point is a mapping with ``"X"`` and ``"Y"`` numeric entries; both
    are formatted with two decimal places. A falsy ``points`` value (empty
    list or ``None``) yields an empty string.
    """
    if not points:
        return ""

    rendered = []
    for point in points:
        rendered.append("({:.2f},{:.2f})".format(point["X"], point["Y"]))
    return ";".join(rendered)


def jsonl_to_csv(input_file, output_file):
    """Convert a JSONL results file to CSV, writing one row per entry.

    Args:
        input_file: Path of the JSONL file to read (one JSON object per line).
        output_file: Path of the CSV file to write; overwritten if it exists.

    Every entry must provide "Url", "ScrollCoordinates", "ElementDescription"
    and "ElementCoordinates". "ImagePath" and "Outputs" are optional; a model
    that is absent from "Outputs" (or whose value is null/empty) gets empty
    cells. Blank lines in the input are skipped.

    Raises:
        KeyError: If a required key is missing from an entry.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON key of each model -> prefix used for its CSV columns.
    model_prefixes = {
        "originalLocatr": "OriginalLocatr",
        "anthropicGroundingLocatr": "AnthropicGroundingLocatr",
    }
    token_metrics = (
        "InputTokens",
        "OutputTokens",
        "TotalTokens",
        "CostInDollars",
    )
    model_columns = (*token_metrics, "GeneratedPoints")

    headers = [
        "Url",
        "ScrollCoordinates_X",
        "ScrollCoordinates_Y",
        "ElementDescription",
        "ElementCoordinates_X",
        "ElementCoordinates_Y",
        *(
            f"{prefix}_{column}"
            for prefix in model_prefixes.values()
            for column in model_columns
        ),
        "ImagePath",
    ]

    # JSON text is UTF-8; being explicit avoids locale-dependent decoding
    # failures on non-ASCII element descriptions.
    with open(output_file, "w", newline="", encoding="utf-8") as csvfile, open(
        input_file, "r", encoding="utf-8"
    ) as jsonl:
        writer = csv.DictWriter(csvfile, fieldnames=headers)
        writer.writeheader()

        for line in jsonl:
            line = line.strip()
            if not line:
                # A trailing or stray blank line is not an entry.
                continue
            entry = json.loads(line)

            row = {
                "Url": entry["Url"],
                "ScrollCoordinates_X": entry["ScrollCoordinates"]["X"],
                "ScrollCoordinates_Y": entry["ScrollCoordinates"]["Y"],
                "ElementDescription": entry["ElementDescription"],
                "ElementCoordinates_X": entry["ElementCoordinates"]["X"],
                "ElementCoordinates_Y": entry["ElementCoordinates"]["Y"],
                "ImagePath": entry.get("ImagePath", ""),
            }

            outputs = entry.get("Outputs") or {}
            for model, prefix in model_prefixes.items():
                output = outputs.get(model)
                if output:
                    for metric in token_metrics:
                        row[f"{prefix}_{metric}"] = output[metric]
                    row[f"{prefix}_GeneratedPoints"] = format_points(
                        output.get("GeneratedPoints", [])
                    )
                else:
                    # Fill with empty values if model data is missing.
                    for column in model_columns:
                        row[f"{prefix}_{column}"] = ""

            writer.writerow(row)


def jsonl_to_markdown(input_file, output_file):
    """Convert a JSONL results file to a Markdown report grouped by URL.

    Args:
        input_file: Path of the JSONL file to read (one JSON object per line).
        output_file: Path of the Markdown file to write; overwritten if it
            exists.

    Entries sharing a "Url" are listed under one top-level heading, in the
    order they first appear in the input. Blank input lines are skipped. A
    model whose output is null/empty is reported as having no output, and a
    null "GeneratedPoints" is treated as an empty list, mirroring how the CSV
    conversion tolerates missing model data.

    Raises:
        KeyError: If a required key is missing from an entry or model output.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # First, group entries by URL.
    url_entries = defaultdict(list)
    with open(input_file, "r", encoding="utf-8") as f_in:
        for line in f_in:
            line = line.strip()
            if not line:
                # A trailing or stray blank line is not an entry.
                continue
            entry = json.loads(line)
            url_entries[entry["Url"]].append(entry)

    with open(output_file, "w", encoding="utf-8") as f_out:
        for url, entries in url_entries.items():
            # Write URL as main heading.
            f_out.write(f"# URL: {url}\n\n")

            for i, entry in enumerate(entries, 1):
                element = entry["ElementCoordinates"]
                scroll = entry["ScrollCoordinates"]

                f_out.write(f"## Entry {i}\n\n")
                f_out.write(
                    f"**Description**: {entry['ElementDescription']}\n\n"
                )
                f_out.write(
                    f"**Coordinates**: X={element['X']}, Y={element['Y']}\n\n"
                )
                f_out.write(
                    f"**Scroll To**: X={scroll['X']}, Y={scroll['Y']}\n\n"
                )

                for model, details in (entry.get("Outputs") or {}).items():
                    f_out.write(f"#### `{model}`\n")

                    if not details:
                        # The model key is present but carries no data
                        # (e.g. a null value); there is nothing to index.
                        f_out.write("- No output recorded\n\n")
                        continue

                    # List all generated points; a null list means none.
                    f_out.write("- Generated Points:\n")
                    points = details.get("GeneratedPoints") or []
                    for j, point in enumerate(points, 1):
                        f_out.write(
                            f"  - Point {j}: X={point['X']}, Y={point['Y']}\n"
                        )
                    f_out.write("\n")

                    f_out.write(f"- Input Tokens: {details['InputTokens']}\n")
                    f_out.write(
                        f"- Output Tokens: {details['OutputTokens']}\n"
                    )
                    f_out.write(f"- Total Tokens: {details['TotalTokens']}\n")
                    f_out.write(
                        f"- Cost in Dollars: {details['CostInDollars']}\n\n"
                    )

                # Only link a screenshot when a non-empty path was recorded;
                # an empty path would render as a broken image.
                if entry.get("ImagePath"):
                    f_out.write("### Annotated Screenshot\n")
                    f_out.write(f"![Screenshot]({entry['ImagePath']})\n\n")

            # Add horizontal line between URLs.
            f_out.write("---\n\n")


if __name__ == "__main__":
    # Exit with a usage hint instead of an IndexError traceback when the
    # results file argument is missing.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: python {Path(sys.argv[0]).name} <results.jsonl>")

    # The suffix is forced to ".jsonl", so the argument may be given with or
    # without the extension.
    input_file = Path(sys.argv[1]).with_suffix(".jsonl")

    # Create both markdown and CSV outputs next to the input file.
    markdown_output = input_file.with_suffix(".md")
    csv_output = input_file.with_suffix(".csv")

    jsonl_to_markdown(input_file, markdown_output)
    jsonl_to_csv(input_file, csv_output)
    print("Conversion complete. Outputs written to:")
    print(f"- Markdown: {markdown_output}")
    print(f"- CSV: {csv_output}")
77 changes: 55 additions & 22 deletions eval/interactive/evaluator.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"image/draw"
"image/jpeg"
"log"
"log/slog"
"os"
"os/signal"
"regexp"
Expand Down Expand Up @@ -68,6 +69,19 @@ type Output struct {
InputTokens int64
OutputTokens int64
TotalTokens int64
CostInDollars float64
}

func calculateCost(inputTokens, outputTokens int, costPer1MInputTokens, costPer1MOutputTokens float64) float64 {
fmt.Printf("cost per 1 mil input tokens: %.5f\n", costPer1MInputTokens)
fmt.Printf("cost per 1 mil output tokens: %.5f\n", costPer1MOutputTokens)

// Convert to float64 before division
inputCost := (float64(inputTokens) / 1000000.0) * costPer1MInputTokens
outputCost := (float64(outputTokens) / 1000000.0) * costPer1MOutputTokens

fmt.Printf("Input cost: %.5f, Output cost: %.5f\n\n", inputCost, outputCost)
return inputCost + outputCost
}

type Captured struct {
Expand All @@ -82,7 +96,9 @@ type LocatrInterface interface {
}

type OriginalLocatr struct {
instance *playwrightLocatr.PlaywrightLocator
instance *playwrightLocatr.PlaywrightLocator
costPer1MInputTokens float64
costPer1MOutputTokens float64
}

func (l OriginalLocatr) call(page *playwright.Page, query string) (*Output, error) {
Expand All @@ -98,13 +114,19 @@ func (l OriginalLocatr) call(page *playwright.Page, query string) (*Output, erro
InputTokens: int64(lastResult.InputTokens),
OutputTokens: int64(lastResult.OutputTokens),
TotalTokens: int64(lastResult.TotalTokens),
CostInDollars: calculateCost(
lastResult.InputTokens,
lastResult.OutputTokens,
l.costPer1MInputTokens,
l.costPer1MOutputTokens,
),
}

if len(lastResult.AllLocatrs) == 0 {
return &output, nil
}

points := make([]Point, len(lastResult.AllLocatrs))
points := []Point{}
appended := false
viewportSize := (*page).ViewportSize()

Expand All @@ -116,6 +138,7 @@ func (l OriginalLocatr) call(page *playwright.Page, query string) (*Output, erro
X := bbox.X + bbox.Width/2
Y := bbox.Y + bbox.Height/2

fmt.Printf("X: %.2f, Y: %.2f\n", X, Y)
// Check if the element is within the viewport
if X < 0 || Y < 0 ||
bbox.X > float64(viewportSize.Width) || bbox.Y > float64(viewportSize.Height) {
Expand All @@ -136,7 +159,9 @@ func (l OriginalLocatr) call(page *playwright.Page, query string) (*Output, erro
const ANTHROPIC_GROUNDING_INSTRUCTION string = `Given the screen resolution of 1280x800, identify the exact (X, Y) coordinates for the described area, element, or object on a browser GUI screen.`

type AnthropicGroundingLocatr struct {
client *anthropic.Client
client *anthropic.Client
costPer1MInputTokens float64
costPer1MOutputTokens float64
}

func (l AnthropicGroundingLocatr) call(page *playwright.Page, query string) (*Output, error) {
Expand All @@ -152,7 +177,6 @@ func (l AnthropicGroundingLocatr) call(page *playwright.Page, query string) (*Ou
if err != nil {
return nil, err
}
fmt.Printf("Tool input schema: %v\n", toolInputSchema)
anthropicToolInputSchema := anthropic.BetaToolInputSchemaParam{
Type: anthropic.F(anthropic.BetaToolInputSchemaTypeObject),
Properties: anthropic.F(toolInputSchema["properties"]),
Expand All @@ -161,7 +185,7 @@ func (l AnthropicGroundingLocatr) call(page *playwright.Page, query string) (*Ou
response, err := l.client.Beta.Messages.New(
context.TODO(),
anthropic.BetaMessageNewParams{
Model: anthropic.F(anthropic.ModelClaude3_5SonnetLatest),
Model: anthropic.F(anthropic.ModelClaude3_5Sonnet20241022),
MaxTokens: anthropic.F(int64(1024)),
Messages: anthropic.F([]anthropic.BetaMessageParam{
{
Expand Down Expand Up @@ -215,6 +239,12 @@ func (l AnthropicGroundingLocatr) call(page *playwright.Page, query string) (*Ou
InputTokens: response.Usage.InputTokens,
OutputTokens: response.Usage.OutputTokens,
TotalTokens: response.Usage.InputTokens + response.Usage.OutputTokens,
CostInDollars: calculateCost(
int(response.Usage.InputTokens),
int(response.Usage.OutputTokens),
l.costPer1MInputTokens,
l.costPer1MOutputTokens,
),
}
content := response.Content[0]

Expand Down Expand Up @@ -259,7 +289,7 @@ func (l AnthropicGroundingLocatr) call(page *playwright.Page, query string) (*Ou

}

func getOriginalLocatrInstance(page *playwright.Page, rerank bool) *playwrightLocatr.PlaywrightLocator {
func getOriginalLocatrInstance(page *playwright.Page) *playwrightLocatr.PlaywrightLocator {
llmClient, err := llm.NewLlmClient(
llm.OpenAI, "gpt-4o", os.Getenv("OPENAI_API_KEY"),
)
Expand All @@ -268,10 +298,8 @@ func getOriginalLocatrInstance(page *playwright.Page, rerank bool) *playwrightLo
return nil
}
locatrOptions := locatr.BaseLocatrOptions{LlmClient: llmClient}
if rerank {
reRankClient := reranker.NewCohereClient(os.Getenv("COHERE_API_KEY"))
locatrOptions.ReRankClient = reRankClient
}
reRankClient := reranker.NewCohereClient(os.Getenv("COHERE_API_KEY"))
locatrOptions.ReRankClient = reRankClient
return playwrightLocatr.NewPlaywrightLocatr(*page, locatrOptions)
}

Expand Down Expand Up @@ -415,16 +443,20 @@ func processURLs(urls []string) error {
return fmt.Errorf("failed to unmarshal captured elements: %v", err)
}

originalLocatrWithoutReranking := OriginalLocatr{instance: getOriginalLocatrInstance(&page, false)} // without Reranker
originalLocatrWithReranking := OriginalLocatr{instance: getOriginalLocatrInstance(&page, true)}
originalLocatr := OriginalLocatr{
instance: getOriginalLocatrInstance(&page),
costPer1MInputTokens: 2.5,
costPer1MOutputTokens: 10.0,
}
anthropicGroundingLocatr := AnthropicGroundingLocatr{
client: anthropic.NewClient(option.WithAPIKey(os.Getenv("ANTHROPIC_API_KEY"))),
client: anthropic.NewClient(option.WithAPIKey(os.Getenv("ANTHROPIC_API_KEY"))),
costPer1MInputTokens: 3.0,
costPer1MOutputTokens: 15.0,
}

redColor := color.RGBA{R: 255, G: 0, B: 0, A: 255} // Selected manually
blueColor := color.RGBA{R: 0, G: 0, B: 255, A: 255} // Generated by original Locator (without reranking)
yellowColor := color.RGBA{R: 255, G: 255, B: 0, A: 255} // Generated by original Locator (with reranking)
greenColor := color.RGBA{R: 0, G: 255, B: 0, A: 255} // Generated by anthropic grounding locatr
redColor := color.RGBA{R: 255, G: 0, B: 0, A: 255} // Selected manually
blueColor := color.RGBA{R: 0, G: 0, B: 255, A: 255} // Generated by original Locator (with reranking)
greenColor := color.RGBA{R: 0, G: 255, B: 0, A: 255} // Generated by anthropic grounding locatr

// Process Captured Elements
for _, elem := range captured {
Expand All @@ -438,7 +470,8 @@ func processURLs(urls []string) error {

// Wait until scroll reaches the desired point
if _, err := page.WaitForFunction(
`([X, Y]) => window.scrollX == X && window.scrollY == Y`, []float64{scrollCoords.X, scrollCoords.Y},
`([X, Y]) =>Math.round(window.scrollX) == Math.round(X) && Math.round(window.scrollY) == Math.round(Y)`,
[]float64{scrollCoords.X, scrollCoords.Y},
); err != nil {
return fmt.Errorf("scroll verification failed: %v", err)
}
Expand All @@ -459,7 +492,7 @@ func processURLs(urls []string) error {
log.Fatalf("Failed to decode image: %v", err)
}

drawPoints(img, &[]Point{elem.ElementCoordinates}, redColor, 14)
drawPoints(img, &[]Point{elem.ElementCoordinates}, redColor, 15)
outputs := make(map[string]*Output, 3)

call := func(locatr LocatrInterface, name string, color color.Color, radius int) {
Expand All @@ -479,9 +512,8 @@ func processURLs(urls []string) error {
}
}

call(originalLocatrWithoutReranking, "originalLocatrWithoutReranking", blueColor, 12)
call(originalLocatrWithReranking, "originalLocatrWithReranking", yellowColor, 10)
call(anthropicGroundingLocatr, "anthropicGroundingLocatr", greenColor, 8)
call(originalLocatr, "originalLocatr", blueColor, 12)
call(anthropicGroundingLocatr, "anthropicGroundingLocatr", greenColor, 9)

// Convert back to bytes
finalBytes, err := imageToBytes(img)
Expand Down Expand Up @@ -549,6 +581,7 @@ func loadURLs(inputPath string) ([]string, error) {

func main() {
// Load environment variables
logger.Level.Set(slog.LevelDebug)
if err := godotenv.Load(); err != nil {
log.Println("Error loading .env file")
}
Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
Loading