From 308972aa0f7aff2913dfa5df0f00be2d65f9c0be Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5%91=20M=C3=B3ricz?=
Date: Tue, 18 Feb 2025 11:02:39 +0100
Subject: [PATCH 1/2] fix(v1/types): fix extract -> json rename

---
 .../src/__tests__/snips/batch-scrape.test.ts |  92 +++++++++++
 apps/api/src/__tests__/snips/scrape.test.ts  |  74 ++++++---
 apps/api/src/controllers/v1/types.ts         | 147 ++++++++----------
 3 files changed, 212 insertions(+), 101 deletions(-)
 create mode 100644 apps/api/src/__tests__/snips/batch-scrape.test.ts

diff --git a/apps/api/src/__tests__/snips/batch-scrape.test.ts b/apps/api/src/__tests__/snips/batch-scrape.test.ts
new file mode 100644
index 0000000000..59c9da2ec5
--- /dev/null
+++ b/apps/api/src/__tests__/snips/batch-scrape.test.ts
@@ -0,0 +1,92 @@
+import request from "supertest";
+import { configDotenv } from "dotenv";
+import { BatchScrapeRequestInput } from "../../controllers/v1/types";
+
+configDotenv();
+const TEST_URL = "http://127.0.0.1:3002";
+
+async function batchScrapeStart(body: BatchScrapeRequestInput) {
+  return await request(TEST_URL)
+    .post("/v1/batch/scrape")
+    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+    .set("Content-Type", "application/json")
+    .send(body);
+}
+
+async function batchScrapeStatus(id: string) {
+  return await request(TEST_URL)
+    .get("/v1/batch/scrape/" + encodeURIComponent(id))
+    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+    .send();
+}
+
+async function batchScrape(body: BatchScrapeRequestInput): ReturnType<typeof batchScrapeStatus> {
+  const bss = await batchScrapeStart(body);
+  expectBatchScrapeStartToSucceed(bss);
+
+  let x;
+
+  do {
+    x = await batchScrapeStatus(bss.body.id);
+    expect(x.statusCode).toBe(200);
+    expect(typeof x.body.status).toBe("string");
+  } while (x.body.status !== "completed");
+
+  expectBatchScrapeToSucceed(x);
+  return x;
+}
+
+function expectBatchScrapeStartToSucceed(response: Awaited<ReturnType<typeof batchScrapeStart>>) {
+  expect(response.statusCode).toBe(200);
+  expect(response.body.success).toBe(true);
+  expect(typeof response.body.id).toBe("string");
+}
+
+function expectBatchScrapeToSucceed(response: Awaited<ReturnType<typeof batchScrapeStatus>>) {
+  expect(response.statusCode).toBe(200);
+  expect(response.body.success).toBe(true);
+  expect(typeof response.body.status).toBe("string");
+  expect(response.body.status).toBe("completed");
+  expect(response.body).toHaveProperty("data");
+  expect(Array.isArray(response.body.data)).toBe(true);
+  expect(response.body.data.length).toBeGreaterThan(0);
+}
+
+describe("Batch scrape tests", () => {
+  describe("JSON format", () => {
+    it.concurrent("works", async () => {
+      const response = await batchScrape({
+        urls: ["http://firecrawl.dev"],
+        formats: ["json"],
+        jsonOptions: {
+          prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.",
+          schema: {
+            type: "object",
+            properties: {
+              company_mission: {
+                type: "string",
+              },
+              supports_sso: {
+                type: "boolean",
+              },
+              is_open_source: {
+                type: "boolean",
+              },
+            },
+            required: ["company_mission", "supports_sso", "is_open_source"],
+          },
+        },
+      });
+
+      expect(response.body.data[0]).toHaveProperty("json");
+      expect(response.body.data[0].json).toHaveProperty("company_mission");
+      expect(typeof response.body.data[0].json.company_mission).toBe("string");
+      expect(response.body.data[0].json).toHaveProperty("supports_sso");
+      expect(response.body.data[0].json.supports_sso).toBe(false);
+      expect(typeof response.body.data[0].json.supports_sso).toBe("boolean");
+      expect(response.body.data[0].json).toHaveProperty("is_open_source");
+      expect(response.body.data[0].json.is_open_source).toBe(true);
+      expect(typeof response.body.data[0].json.is_open_source).toBe("boolean");
+    }, 30000);
+  });
+});

diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts
index 7970d820b3..88443a2d2b 100644
--- a/apps/api/src/__tests__/snips/scrape.test.ts
+++ b/apps/api/src/__tests__/snips/scrape.test.ts
@@ -1,11 +1,11 @@
 import request from "supertest";
 import { configDotenv } from "dotenv";
-import { ScrapeRequestInput } from "../../controllers/v1/types";
+import { Document, ScrapeRequestInput } from "../../controllers/v1/types";
 
 configDotenv();
 const TEST_URL = "http://127.0.0.1:3002";
 
-async function scrape(body: ScrapeRequestInput) {
+async function scrapeRaw(body: ScrapeRequestInput) {
   return await request(TEST_URL)
     .post("/v1/scrape")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@@ -13,12 +13,18 @@ async function scrape(body: ScrapeRequestInput) {
     .send(body);
 }
 
-function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrape>>) {
+function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrapeRaw>>) {
   expect(response.statusCode).toBe(200);
   expect(response.body.success).toBe(true);
   expect(typeof response.body.data).toBe("object");
 }
 
+async function scrape(body: ScrapeRequestInput): Promise<Document> {
+  const raw = await scrapeRaw(body);
+  expectScrapeToSucceed(raw);
+  return raw.body.data;
+}
+
 describe("Scrape tests", () => {
   it("mocking works properly", async () => {
     // depends on falsified mock mocking-works-properly
@@ -30,8 +36,7 @@ describe("Scrape tests", () => {
       useMock: "mocking-works-properly",
     });
 
-    expectScrapeToSucceed(response);
-    expect(response.body.data.markdown).toBe(
+    expect(response.markdown).toBe(
       "this is fake data coming from the mocking system!",
     );
   }, 10000);
@@ -42,8 +47,7 @@ describe("Scrape tests", () => {
       url: "https://canyoublockit.com/testing/",
     });
 
-    expectScrapeToSucceed(response);
-    expect(response.body.data.markdown).not.toContain(".g.doubleclick.net/");
+    expect(response.markdown).not.toContain(".g.doubleclick.net/");
   }, 10000);
 
   it.concurrent("doesn't block ads if explicitly disabled", async () => {
@@ -52,8 +56,7 @@ describe("Scrape tests", () => {
       blockAds: false,
     });
 
-    expectScrapeToSucceed(response);
-    expect(response.body.data.markdown).toContain(".g.doubleclick.net/");
+    expect(response.markdown).toContain(".g.doubleclick.net/");
   }, 10000);
   });
@@ -62,8 +65,6 @@ describe("Scrape tests", () => {
     const response = await scrape({
       url: "https://iplocation.com",
     });
-
-    expectScrapeToSucceed(response);
   }, 10000);
 
   it.concurrent("works with country US", async () => {
@@ -72,8 +73,7 @@ describe("Scrape tests", () => {
       location: { country: "US" },
     });
 
-    expectScrapeToSucceed(response);
-    expect(response.body.data.markdown).toContain("| Country | United States |");
+    expect(response.markdown).toContain("| Country | United States |");
   }, 10000);
   });
@@ -84,8 +84,7 @@ describe("Scrape tests", () => {
       formats: ["rawHtml"],
     });
 
-    expectScrapeToSucceed(response);
-    const obj = JSON.parse(response.body.data.rawHtml);
+    const obj = JSON.parse(response.rawHtml!);
     expect(obj.id).toBe(1);
   }, 25000); // TODO: mock and shorten
   });
@@ -97,8 +96,7 @@ describe("Scrape tests", () => {
       formats: ["screenshot"]
     });
 
-    expectScrapeToSucceed(response);
-    expect(typeof response.body.data.screenshot).toBe("string");
+    expect(typeof response.screenshot).toBe("string");
   }, 15000);
 
   it.concurrent("screenshot@fullPage format works", async () => {
@@ -107,8 +105,44 @@ describe("Scrape tests", () => {
       formats: ["screenshot@fullPage"]
     });
 
-    expectScrapeToSucceed(response);
-    expect(typeof response.body.data.screenshot).toBe("string");
+    expect(typeof response.screenshot).toBe("string");
   }, 15000);
-  })
+  });
+
+  describe("JSON format", () => {
+    it.concurrent("works", async () => {
+      const response = await scrape({
+        url: "http://firecrawl.dev",
+        formats: ["json"],
+        jsonOptions: {
+          prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.",
+          schema: {
+            type: "object",
+            properties: {
+              company_mission: {
+                type: "string",
+              },
+              supports_sso: {
+                type: "boolean",
+              },
+              is_open_source: {
+                type: "boolean",
+              },
+            },
+            required: ["company_mission", "supports_sso", "is_open_source"],
+          },
+        },
+      });
+
+      expect(response).toHaveProperty("json");
+      expect(response.json).toHaveProperty("company_mission");
+      expect(typeof response.json.company_mission).toBe("string");
+      expect(response.json).toHaveProperty("supports_sso");
+      expect(response.json.supports_sso).toBe(false);
+      expect(typeof response.json.supports_sso).toBe("boolean");
+      expect(response.json).toHaveProperty("is_open_source");
+      expect(response.json.is_open_source).toBe(true);
+      expect(typeof response.json.is_open_source).toBe("boolean");
+    }, 30000);
+  });
 });

diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index 858c792ff0..41e766d989 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -220,6 +220,54 @@ const baseScrapeOptions = z
   })
   .strict(strictMessage);
 
+const extractRefine = (obj) => {
+  const hasExtractFormat = obj.formats?.includes("extract");
+  const hasExtractOptions = obj.extract !== undefined;
+  const hasJsonFormat = obj.formats?.includes("json");
+  const hasJsonOptions = obj.jsonOptions !== undefined;
+  return (
+    (hasExtractFormat && hasExtractOptions) ||
+    (!hasExtractFormat && !hasExtractOptions) ||
+    (hasJsonFormat && hasJsonOptions) ||
+    (!hasJsonFormat && !hasJsonOptions)
+  );
+};
+const extractRefineOpts = {
+  message:
+    "When 'extract' or 'json' format is specified, corresponding options must be provided, and vice versa",
+};
+const extractTransform = (obj) => {
+  // Handle timeout
+  if (
+    (obj.formats?.includes("extract") ||
+      obj.extract ||
+      obj.formats?.includes("json") ||
+      obj.jsonOptions) &&
+    !obj.timeout
+  ) {
+    obj = { ...obj, timeout: 60000 };
+  }
+
+  if (obj.formats?.includes("json")) {
+    obj.formats.push("extract");
+  }
+
+  // Convert JSON options to extract options if needed
+  if (obj.jsonOptions && !obj.extract) {
+    obj = {
+      ...obj,
+      extract: {
+        prompt: obj.jsonOptions.prompt,
+        systemPrompt: obj.jsonOptions.systemPrompt,
+        schema: obj.jsonOptions.schema,
+        mode: "llm",
+      },
+    };
+  }
+
+  return obj;
+}
+
 export const scrapeOptions = baseScrapeOptions.refine(
   (obj) => {
     if (!obj.actions) return true;
@@ -228,7 +276,8 @@ export const scrapeOptions = baseScrapeOptions.refine(
   {
     message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`,
   }
-);
+).refine(extractRefine, extractRefineOpts)
+.transform(extractRefine);
 
 export type ScrapeOptions = z.infer<typeof scrapeOptions>;
@@ -280,7 +329,9 @@ export const extractV1Options = z
   .transform((obj) => ({
     ...obj,
     allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch,
-  }));
+  }))
+  .refine(x => extractRefine(x.scrapeOptions), extractRefineOpts)
+  .transform(x => extractTransform(x.scrapeOptions));
 export type ExtractV1Options = z.infer<typeof extractV1Options>;
 export const extractRequestSchema = extractV1Options;
@@ -294,55 +345,8 @@ export const scrapeRequestSchema = baseScrapeOptions
     timeout: z.number().int().positive().finite().safe().default(30000),
   })
   .strict(strictMessage)
-  .refine(
-    (obj) => {
-      const hasExtractFormat = obj.formats?.includes("extract");
-      const hasExtractOptions = obj.extract !== undefined;
-      const hasJsonFormat = obj.formats?.includes("json");
-      const hasJsonOptions = obj.jsonOptions !== undefined;
-      return (
-        (hasExtractFormat && hasExtractOptions) ||
-        (!hasExtractFormat && !hasExtractOptions) ||
-        (hasJsonFormat && hasJsonOptions) ||
-        (!hasJsonFormat && !hasJsonOptions)
-      );
-    },
-    {
-      message:
-        "When 'extract' or 'json' format is specified, corresponding options must be provided, and vice versa",
-    },
-  )
-  .transform((obj) => {
-    // Handle timeout
-    if (
-      (obj.formats?.includes("extract") ||
-        obj.extract ||
-        obj.formats?.includes("json") ||
-        obj.jsonOptions) &&
-      !obj.timeout
-    ) {
-      obj = { ...obj, timeout: 60000 };
-    }
-
-    if (obj.formats?.includes("json")) {
-      obj.formats.push("extract");
-    }
-
-    // Convert JSON options to extract options if needed
-    if (obj.jsonOptions && !obj.extract) {
-      obj = {
-        ...obj,
-        extract: {
-          prompt: obj.jsonOptions.prompt,
-          systemPrompt: obj.jsonOptions.systemPrompt,
-          schema: obj.jsonOptions.schema,
-          mode: "llm",
-        },
-      };
-    }
-
-    return obj;
-  });
+  .refine(extractRefine, extractRefineOpts)
+  .transform(extractTransform);
 
 export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
 export type ScrapeRequestInput = z.input<typeof scrapeRequestSchema>;
@@ -374,20 +378,8 @@ export const batchScrapeRequestSchema = baseScrapeOptions
     ignoreInvalidURLs: z.boolean().default(false),
   })
   .strict(strictMessage)
-  .refine(
-    (obj) => {
-      const hasExtractFormat = obj.formats?.includes("extract");
-      const hasExtractOptions = obj.extract !== undefined;
-      return (
-        (hasExtractFormat && hasExtractOptions) ||
-        (!hasExtractFormat && !hasExtractOptions)
-      );
-    },
-    {
-      message:
-        "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
-    },
-  );
+  .refine(extractRefine, extractRefineOpts)
+  .transform(extractTransform);
 
 export const batchScrapeRequestSchemaNoURLValidation = baseScrapeOptions
   .extend({
@@ -398,22 +390,11 @@ export const batchScrapeRequestSchemaNoURLValidation = baseScrapeOptions
     ignoreInvalidURLs: z.boolean().default(false),
   })
   .strict(strictMessage)
-  .refine(
-    (obj) => {
-      const hasExtractFormat = obj.formats?.includes("extract");
-      const hasExtractOptions = obj.extract !== undefined;
-      return (
-        (hasExtractFormat && hasExtractOptions) ||
-        (!hasExtractFormat && !hasExtractOptions)
-      );
-    },
-    {
-      message:
-        "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
-    },
-  );
+  .refine(extractRefine, extractRefineOpts)
+  .transform(extractTransform);
 
 export type BatchScrapeRequest = z.infer<typeof batchScrapeRequestSchema>;
+export type BatchScrapeRequestInput = z.input<typeof batchScrapeRequestSchema>;
 
 const crawlerOptions = z
   .object({
@@ -451,7 +432,9 @@ export const crawlRequestSchema = crawlerOptions
     webhook: webhookSchema.optional(),
     limit: z.number().default(10000),
   })
-  .strict(strictMessage);
+  .strict(strictMessage)
+  .refine(x => extractRefine(x.scrapeOptions), extractRefineOpts)
+  .transform(x => extractTransform(x.scrapeOptions));
 
 // export type CrawlRequest = {
 //   url: string;
@@ -935,7 +918,9 @@ export const searchRequestSchema = z
   })
   .strict(
     "Unrecognized key in body -- please review the v1 API documentation for request body changes",
-  );
+  )
+  .refine(x => extractRefine(x.scrapeOptions), extractRefineOpts)
+  .transform(x => extractTransform(x.scrapeOptions));
 
 export type SearchRequest = z.infer<typeof searchRequestSchema>;

From 953b688f2d26cb219d3b03bdb7f4a9343cb81432 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5%91=20M=C3%B3ricz?=
Date: Tue, 18 Feb 2025 11:38:26 +0100
Subject: [PATCH 2/2] fix(types/v1): bad transform

---
 apps/api/src/controllers/v1/types.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index 41e766d989..06ae58f757 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -277,7 +277,7 @@ export const scrapeOptions = baseScrapeOptions.refine(
     message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`,
   }
 ).refine(extractRefine, extractRefineOpts)
-.transform(extractRefine);
+.transform(extractTransform);
 
 export type ScrapeOptions = z.infer<typeof scrapeOptions>;
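
A quick sketch of what the consolidated helpers do end to end (not part of the patches above; the URL and prompt are made up for illustration, while scrapeRequestSchema, extractRefine and extractTransform are the ones defined in apps/api/src/controllers/v1/types.ts):

    import { scrapeRequestSchema } from "./controllers/v1/types";

    // A v1 request that uses the new "json" format name...
    const parsed = scrapeRequestSchema.parse({
      url: "https://firecrawl.dev",
      formats: ["json"],
      jsonOptions: { prompt: "What is the company's mission?" },
    });

    // ...passes extractRefine ("json" format and jsonOptions are both present)
    // and is then normalized by extractTransform back to the legacy "extract"
    // shape that the scraper internals still consume:
    console.log(parsed.formats); // expected: ["json", "extract"]
    console.log(parsed.extract); // expected: { prompt: "...", mode: "llm", ... }

The same refine/transform pair is shared by the scrape, batch scrape, crawl, extract and search schemas, which is what lets the tests above assert on data[0].json while extraction keeps running through the extract pipeline.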