Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(v1/types): fix extract -> json rename (FIR-1072) #1195

Merged
merged 2 commits into from
Feb 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 92 additions & 0 deletions apps/api/src/__tests__/snips/batch-scrape.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import request from "supertest";
import { configDotenv } from "dotenv";
import { BatchScrapeRequestInput } from "../../controllers/v1/types";

configDotenv();
const TEST_URL = "http://127.0.0.1:3002";

/**
 * Submits a batch scrape job to the API under test.
 *
 * Sends `body` as JSON in a POST to `/v1/batch/scrape` on `TEST_URL`,
 * authenticated with the bearer token read from `process.env.TEST_API_KEY`.
 *
 * @param body - Batch scrape request payload.
 * @returns The raw HTTP response; no assertions are made here.
 */
async function batchScrapeStart(body: BatchScrapeRequestInput) {
  const authorization = `Bearer ${process.env.TEST_API_KEY}`;

  const response = await request(TEST_URL)
    .post("/v1/batch/scrape")
    .set("Authorization", authorization)
    .set("Content-Type", "application/json")
    .send(body);

  return response;
}

/**
 * Fetches the current status of a batch scrape job.
 *
 * Issues a GET to `/v1/batch/scrape/<id>` on `TEST_URL` (the id is
 * URI-component encoded) with the bearer token from
 * `process.env.TEST_API_KEY`.
 *
 * @param id - Job identifier to look up.
 * @returns The raw HTTP response; no assertions are made here.
 */
async function batchScrapeStatus(id: string) {
  const statusPath = "/v1/batch/scrape/" + encodeURIComponent(id);
  const authorization = `Bearer ${process.env.TEST_API_KEY}`;

  const response = await request(TEST_URL)
    .get(statusPath)
    .set("Authorization", authorization)
    .send();

  return response;
}

/**
 * Starts a batch scrape and polls its status endpoint until the job reports
 * `"completed"`, then returns that final status response.
 *
 * The start response and the final status response are both validated with
 * the `expect*ToSucceed` helpers, and every intermediate poll must return
 * HTTP 200 with a string `status`.
 *
 * @param body - Batch scrape request payload.
 * @returns The status response observed once the job is `"completed"`.
 */
async function batchScrape(body: BatchScrapeRequestInput): ReturnType<typeof batchScrapeStatus> {
  const bss = await batchScrapeStart(body);
  expectBatchScrapeStartToSucceed(bss);

  // Pause between polls so the loop does not issue back-to-back requests
  // against the API while the job is still running.
  const pollIntervalMs = 500;
  let x: Awaited<ReturnType<typeof batchScrapeStatus>>;

  do {
    x = await batchScrapeStatus(bss.body.id);
    expect(x.statusCode).toBe(200);
    expect(typeof x.body.status).toBe("string");
    // Fail immediately with a readable assertion instead of spinning until
    // the Jest timeout.
    // NOTE(review): assumes the API reports a terminal "failed" status — confirm.
    expect(x.body.status).not.toBe("failed");

    if (x.body.status !== "completed") {
      await new Promise<void>((resolve) => setTimeout(resolve, pollIntervalMs));
    }
  } while (x.body.status !== "completed");

  expectBatchScrapeToSucceed(x);
  return x;
}

/**
 * Asserts that a batch scrape *start* response succeeded.
 *
 * Checks for HTTP 200, `body.success === true`, and a string `body.id`.
 *
 * @param response - Response returned by `batchScrapeStart`. The parameter
 *   type is derived from `batchScrapeStart`, the call whose response this
 *   helper validates, rather than from the polling helper `batchScrape`.
 */
function expectBatchScrapeStartToSucceed(response: Awaited<ReturnType<typeof batchScrapeStart>>) {
  expect(response.statusCode).toBe(200);
  expect(response.body.success).toBe(true);
  expect(typeof response.body.id).toBe("string");
}

/**
 * Asserts that a batch scrape *status* response describes a finished,
 * successful job.
 *
 * Checks for HTTP 200, `success === true`, a string `status` equal to
 * `"completed"`, and a non-empty `data` array.
 *
 * @param response - Response returned by `batchScrapeStatus`.
 */
function expectBatchScrapeToSucceed(response: Awaited<ReturnType<typeof batchScrapeStatus>>) {
  const { statusCode, body } = response;

  // Transport-level and envelope checks.
  expect(statusCode).toBe(200);
  expect(body.success).toBe(true);

  // Job state checks.
  expect(typeof body.status).toBe("string");
  expect(body.status).toBe("completed");

  // Payload checks.
  expect(body).toHaveProperty("data");
  expect(Array.isArray(body.data)).toBe(true);
  expect(body.data.length).toBeGreaterThan(0);
}

describe("Batch scrape tests", () => {
  describe("JSON format", () => {
    // Runs a single-URL batch scrape requesting the "json" format and checks
    // that the first returned document carries the fields named in the schema.
    it.concurrent("works", async () => {
      const response = await batchScrape({
        urls: ["http://firecrawl.dev"],
        formats: ["json"],
        jsonOptions: {
          prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.",
          schema: {
            type: "object",
            properties: {
              company_mission: { type: "string" },
              supports_sso: { type: "boolean" },
              is_open_source: { type: "boolean" },
            },
            required: ["company_mission", "supports_sso", "is_open_source"],
          },
        },
      });

      const firstDocument = response.body.data[0];
      expect(firstDocument).toHaveProperty("json");

      // Only read `.json` after the assertion above has confirmed it exists.
      const extracted = firstDocument.json;

      expect(extracted).toHaveProperty("company_mission");
      expect(typeof extracted.company_mission).toBe("string");

      expect(extracted).toHaveProperty("supports_sso");
      expect(extracted.supports_sso).toBe(false);
      expect(typeof extracted.supports_sso).toBe("boolean");

      expect(extracted).toHaveProperty("is_open_source");
      expect(extracted.is_open_source).toBe(true);
      expect(typeof extracted.is_open_source).toBe("boolean");
    }, 30000);
  });
});
74 changes: 54 additions & 20 deletions apps/api/src/__tests__/snips/scrape.test.ts
Original file line number Diff line number Diff line change
@@ -1,24 +1,30 @@
import request from "supertest";
import { configDotenv } from "dotenv";
import { ScrapeRequestInput } from "../../controllers/v1/types";
import { Document, ScrapeRequestInput } from "../../controllers/v1/types";

configDotenv();
const TEST_URL = "http://127.0.0.1:3002";

async function scrape(body: ScrapeRequestInput) {
/**
 * Sends a scrape request to the API under test and returns the raw response.
 *
 * Sends `body` as JSON in a POST to `/v1/scrape` on `TEST_URL`, authenticated
 * with the bearer token read from `process.env.TEST_API_KEY`. No assertions
 * are made here.
 *
 * @param body - Scrape request payload.
 * @returns The raw HTTP response.
 */
async function scrapeRaw(body: ScrapeRequestInput) {
  const authorization = `Bearer ${process.env.TEST_API_KEY}`;

  const response = await request(TEST_URL)
    .post("/v1/scrape")
    .set("Authorization", authorization)
    .set("Content-Type", "application/json")
    .send(body);

  return response;
}

function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrape>>) {
/**
 * Asserts that a raw scrape response succeeded.
 *
 * Checks for HTTP 200, `success === true`, and that `typeof data` is
 * `"object"`.
 *
 * @param response - Response returned by `scrapeRaw`.
 */
function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrapeRaw>>) {
  const { statusCode, body } = response;

  expect(statusCode).toBe(200);
  expect(body.success).toBe(true);
  expect(typeof body.data).toBe("object");
}

/**
 * Performs a scrape, asserts that it succeeded, and returns the document.
 *
 * Callers receive the `data` field of the response body directly and do not
 * need to run the success assertions themselves.
 *
 * @param body - Scrape request payload.
 * @returns The `data` field of the response body.
 */
async function scrape(body: ScrapeRequestInput): Promise<Document> {
  const rawResponse = await scrapeRaw(body);

  // Validate the envelope before handing the payload to the caller.
  expectScrapeToSucceed(rawResponse);

  const { data } = rawResponse.body;
  return data;
}

describe("Scrape tests", () => {
it("mocking works properly", async () => {
// depends on falsified mock mocking-works-properly
Expand All @@ -30,8 +36,7 @@ describe("Scrape tests", () => {
useMock: "mocking-works-properly",
});

expectScrapeToSucceed(response);
expect(response.body.data.markdown).toBe(
expect(response.markdown).toBe(
"this is fake data coming from the mocking system!",
);
}, 10000);
Expand All @@ -42,8 +47,7 @@ describe("Scrape tests", () => {
url: "https://canyoublockit.com/testing/",
});

expectScrapeToSucceed(response);
expect(response.body.data.markdown).not.toContain(".g.doubleclick.net/");
expect(response.markdown).not.toContain(".g.doubleclick.net/");
}, 10000);

it.concurrent("doesn't block ads if explicitly disabled", async () => {
Expand All @@ -52,8 +56,7 @@ describe("Scrape tests", () => {
blockAds: false,
});

expectScrapeToSucceed(response);
expect(response.body.data.markdown).toContain(".g.doubleclick.net/");
expect(response.markdown).toContain(".g.doubleclick.net/");
}, 10000);
});

Expand All @@ -62,8 +65,6 @@ describe("Scrape tests", () => {
const response = await scrape({
url: "https://iplocation.com",
});

expectScrapeToSucceed(response);
}, 10000);

it.concurrent("works with country US", async () => {
Expand All @@ -72,8 +73,7 @@ describe("Scrape tests", () => {
location: { country: "US" },
});

expectScrapeToSucceed(response);
expect(response.body.data.markdown).toContain("| Country | United States |");
expect(response.markdown).toContain("| Country | United States |");
}, 10000);
});

Expand All @@ -84,8 +84,7 @@ describe("Scrape tests", () => {
formats: ["rawHtml"],
});

expectScrapeToSucceed(response);
const obj = JSON.parse(response.body.data.rawHtml);
const obj = JSON.parse(response.rawHtml!);
expect(obj.id).toBe(1);
}, 25000); // TODO: mock and shorten
});
Expand All @@ -97,8 +96,7 @@ describe("Scrape tests", () => {
formats: ["screenshot"]
});

expectScrapeToSucceed(response);
expect(typeof response.body.data.screenshot).toBe("string");
expect(typeof response.screenshot).toBe("string");
}, 15000);

it.concurrent("screenshot@fullPage format works", async () => {
Expand All @@ -107,8 +105,44 @@ describe("Scrape tests", () => {
formats: ["screenshot@fullPage"]
});

expectScrapeToSucceed(response);
expect(typeof response.body.data.screenshot).toBe("string");
expect(typeof response.screenshot).toBe("string");
}, 15000);
})
});

describe("JSON format", () => {
  // Requests the "json" format for a single URL and checks that the returned
  // document carries the fields named in the schema.
  it.concurrent("works", async () => {
    const response = await scrape({
      url: "http://firecrawl.dev",
      formats: ["json"],
      jsonOptions: {
        prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.",
        schema: {
          type: "object",
          properties: {
            company_mission: { type: "string" },
            supports_sso: { type: "boolean" },
            is_open_source: { type: "boolean" },
          },
          required: ["company_mission", "supports_sso", "is_open_source"],
        },
      },
    });

    expect(response).toHaveProperty("json");

    // Only read `.json` after the assertion above has confirmed it exists.
    const extracted = response.json;

    expect(extracted).toHaveProperty("company_mission");
    expect(typeof extracted.company_mission).toBe("string");

    expect(extracted).toHaveProperty("supports_sso");
    expect(extracted.supports_sso).toBe(false);
    expect(typeof extracted.supports_sso).toBe("boolean");

    expect(extracted).toHaveProperty("is_open_source");
    expect(extracted.is_open_source).toBe(true);
    expect(typeof extracted.is_open_source).toBe("boolean");
  }, 30000);
});
});
Loading