diff --git a/apps/api/src/__tests__/snips/batch-scrape.test.ts b/apps/api/src/__tests__/snips/batch-scrape.test.ts
deleted file mode 100644
index 59c9da2ec5..0000000000
--- a/apps/api/src/__tests__/snips/batch-scrape.test.ts
+++ /dev/null
@@ -1,92 +0,0 @@
-import request from "supertest";
-import { configDotenv } from "dotenv";
-import { BatchScrapeRequestInput } from "../../controllers/v1/types";
-
-configDotenv();
-const TEST_URL = "http://127.0.0.1:3002";
-
-async function batchScrapeStart(body: BatchScrapeRequestInput) {
-  return await request(TEST_URL)
-    .post("/v1/batch/scrape")
-    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
-    .set("Content-Type", "application/json")
-    .send(body);
-}
-
-async function batchScrapeStatus(id: string) {
-  return await request(TEST_URL)
-    .get("/v1/batch/scrape/" + encodeURIComponent(id))
-    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
-    .send();
-}
-
-async function batchScrape(body: BatchScrapeRequestInput): ReturnType<typeof batchScrapeStatus> {
-  const bss = await batchScrapeStart(body);
-  expectBatchScrapeStartToSucceed(bss);
-
-  let x;
-
-  do {
-    x = await batchScrapeStatus(bss.body.id);
-    expect(x.statusCode).toBe(200);
-    expect(typeof x.body.status).toBe("string");
-  } while (x.body.status !== "completed")
-
-  expectBatchScrapeToSucceed(x);
-  return x;
-}
-
-function expectBatchScrapeStartToSucceed(response: Awaited<ReturnType<typeof batchScrapeStart>>) {
-  expect(response.statusCode).toBe(200);
-  expect(response.body.success).toBe(true);
-  expect(typeof response.body.id).toBe("string");
-}
-
-function expectBatchScrapeToSucceed(response: Awaited<ReturnType<typeof batchScrapeStatus>>) {
-  expect(response.statusCode).toBe(200);
-  expect(response.body.success).toBe(true);
-  expect(typeof response.body.status).toBe("string");
-  expect(response.body.status).toBe("completed");
-  expect(response.body).toHaveProperty("data");
-  expect(Array.isArray(response.body.data)).toBe(true);
-  expect(response.body.data.length).toBeGreaterThan(0);
-}
-
-describe("Batch scrape tests", () => {
-  describe("JSON format", () => {
-    it.concurrent("works", async () => {
-      const response = await batchScrape({
-        urls: ["http://firecrawl.dev"],
-        formats: ["json"],
-        jsonOptions: {
-          prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.",
-          schema: {
-            type: "object",
-            properties: {
-              company_mission: {
-                type: "string",
-              },
-              supports_sso: {
-                type: "boolean",
-              },
-              is_open_source: {
-                type: "boolean",
-              },
-            },
-            required: ["company_mission", "supports_sso", "is_open_source"],
-          },
-        },
-      });
-
-      expect(response.body.data[0]).toHaveProperty("json");
-      expect(response.body.data[0].json).toHaveProperty("company_mission");
-      expect(typeof response.body.data[0].json.company_mission).toBe("string");
-      expect(response.body.data[0].json).toHaveProperty("supports_sso");
-      expect(response.body.data[0].json.supports_sso).toBe(false);
-      expect(typeof response.body.data[0].json.supports_sso).toBe("boolean");
-      expect(response.body.data[0].json).toHaveProperty("is_open_source");
-      expect(response.body.data[0].json.is_open_source).toBe(true);
-      expect(typeof response.body.data[0].json.is_open_source).toBe("boolean");
-    }, 30000);
-  });
-});
diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts
index 88443a2d2b..7970d820b3 100644
--- a/apps/api/src/__tests__/snips/scrape.test.ts
+++ b/apps/api/src/__tests__/snips/scrape.test.ts
@@ -1,11 +1,11 @@
 import request from "supertest";
 import { configDotenv } from "dotenv";
-import { Document, ScrapeRequestInput } from "../../controllers/v1/types";
+import { ScrapeRequestInput } from "../../controllers/v1/types";
 
 configDotenv();
 const TEST_URL = "http://127.0.0.1:3002";
 
-async function scrapeRaw(body: ScrapeRequestInput) {
+async function scrape(body: ScrapeRequestInput) {
   return await request(TEST_URL)
     .post("/v1/scrape")
     .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
@@ -13,18 +13,12 @@ async function scrapeRaw(body: ScrapeRequestInput) {
     .send(body);
 }
 
-function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrapeRaw>>) {
+function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrape>>) {
   expect(response.statusCode).toBe(200);
   expect(response.body.success).toBe(true);
   expect(typeof response.body.data).toBe("object");
 }
 
-async function scrape(body: ScrapeRequestInput): Promise<Document> {
-  const raw = await scrapeRaw(body);
-  expectScrapeToSucceed(raw);
-  return raw.body.data;
-}
-
 describe("Scrape tests", () => {
   it("mocking works properly", async () => {
     // depends on falsified mock mocking-works-properly
@@ -36,7 +30,8 @@ describe("Scrape tests", () => {
       useMock: "mocking-works-properly",
    });
 
-    expect(response.markdown).toBe(
+    expectScrapeToSucceed(response);
+    expect(response.body.data.markdown).toBe(
      "this is fake data coming from the mocking system!",
    );
   }, 10000);
@@ -47,7 +42,8 @@ describe("Scrape tests", () => {
        url: "https://canyoublockit.com/testing/",
      });
 
-      expect(response.markdown).not.toContain(".g.doubleclick.net/");
+      expectScrapeToSucceed(response);
+      expect(response.body.data.markdown).not.toContain(".g.doubleclick.net/");
    }, 10000);
 
    it.concurrent("doesn't block ads if explicitly disabled", async () => {
@@ -56,7 +52,8 @@ describe("Scrape tests", () => {
        blockAds: false,
      });
 
-      expect(response.markdown).toContain(".g.doubleclick.net/");
+      expectScrapeToSucceed(response);
+      expect(response.body.data.markdown).toContain(".g.doubleclick.net/");
    }, 10000);
  });
 
@@ -65,6 +62,8 @@ describe("Scrape tests", () => {
      const response = await scrape({
        url: "https://iplocation.com",
      });
+
+      expectScrapeToSucceed(response);
    }, 10000);
 
    it.concurrent("works with country US", async () => {
@@ -73,7 +72,8 @@ describe("Scrape tests", () => {
        location: { country: "US" },
      });
 
-      expect(response.markdown).toContain("| Country | United States |");
+      expectScrapeToSucceed(response);
+      expect(response.body.data.markdown).toContain("| Country | United States |");
    }, 10000);
  });
 
@@ -84,7 +84,8 @@ describe("Scrape tests", () => {
        formats: ["rawHtml"],
      });
 
-      const obj = JSON.parse(response.rawHtml!);
+      expectScrapeToSucceed(response);
+      const obj = JSON.parse(response.body.data.rawHtml);
      expect(obj.id).toBe(1);
    }, 25000); // TODO: mock and shorten
  });
@@ -96,7 +97,8 @@ describe("Scrape tests", () => {
        formats: ["screenshot"]
      });
 
-      expect(typeof response.screenshot).toBe("string");
+      expectScrapeToSucceed(response);
+      expect(typeof response.body.data.screenshot).toBe("string");
    }, 15000);
 
    it.concurrent("screenshot@fullPage format works", async () => {
@@ -105,44 +107,8 @@ describe("Scrape tests", () => {
        formats: ["screenshot@fullPage"]
      });
 
-      expect(typeof response.screenshot).toBe("string");
+      expectScrapeToSucceed(response);
+      expect(typeof response.body.data.screenshot).toBe("string");
    }, 15000);
-  });
-
-  describe("JSON format", () => {
-    it.concurrent("works", async () => {
-      const response = await scrape({
-        url: "http://firecrawl.dev",
-        formats: ["json"],
-        jsonOptions: {
-          prompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source.",
-          schema: {
-            type: "object",
-            properties: {
-              company_mission: {
-                type: "string",
-              },
-              supports_sso: {
-                type: "boolean",
-              },
-              is_open_source: {
-                type: "boolean",
-              },
-            },
-            required: ["company_mission", "supports_sso", "is_open_source"],
-          },
-        },
-      });
-
-      expect(response).toHaveProperty("json");
-      expect(response.json).toHaveProperty("company_mission");
-      expect(typeof response.json.company_mission).toBe("string");
-      expect(response.json).toHaveProperty("supports_sso");
-      expect(response.json.supports_sso).toBe(false);
-      expect(typeof response.json.supports_sso).toBe("boolean");
-      expect(response.json).toHaveProperty("is_open_source");
-      expect(response.json.is_open_source).toBe(true);
-      expect(typeof response.json.is_open_source).toBe("boolean");
-    }, 30000);
-  });
+  })
 });
diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index 06ae58f757..858c792ff0 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -220,54 +220,6 @@ const baseScrapeOptions = z
  })
  .strict(strictMessage);
 
-const extractRefine = (obj) => {
-  const hasExtractFormat = obj.formats?.includes("extract");
-  const hasExtractOptions = obj.extract !== undefined;
-  const hasJsonFormat = obj.formats?.includes("json");
-  const hasJsonOptions = obj.jsonOptions !== undefined;
-  return (
-    (hasExtractFormat && hasExtractOptions) ||
-    (!hasExtractFormat && !hasExtractOptions) ||
-    (hasJsonFormat && hasJsonOptions) ||
-    (!hasJsonFormat && !hasJsonOptions)
-  );
-};
-const extractRefineOpts = {
-  message:
-    "When 'extract' or 'json' format is specified, corresponding options must be provided, and vice versa",
-};
-const extractTransform = (obj) => {
-  // Handle timeout
-  if (
-    (obj.formats?.includes("extract") ||
-      obj.extract ||
-      obj.formats?.includes("json") ||
-      obj.jsonOptions) &&
-    !obj.timeout
-  ) {
-    obj = { ...obj, timeout: 60000 };
-  }
-
-  if (obj.formats?.includes("json")) {
-    obj.formats.push("extract");
-  }
-
-  // Convert JSON options to extract options if needed
-  if (obj.jsonOptions && !obj.extract) {
-    obj = {
-      ...obj,
-      extract: {
-        prompt: obj.jsonOptions.prompt,
-        systemPrompt: obj.jsonOptions.systemPrompt,
-        schema: obj.jsonOptions.schema,
-        mode: "llm",
-      },
-    };
-  }
-
-  return obj;
-}
-
 export const scrapeOptions = baseScrapeOptions.refine(
  (obj) => {
    if (!obj.actions) return true;
@@ -276,8 +228,7 @@ export const scrapeOptions = baseScrapeOptions.refine(
  {
    message: `Total wait time (waitFor + wait actions) cannot exceed ${ACTIONS_MAX_WAIT_TIME} seconds`,
  }
-).refine(extractRefine, extractRefineOpts)
-.transform(extractTransform);
+);
 
 export type ScrapeOptions = z.infer<typeof scrapeOptions>;
 
@@ -329,9 +280,7 @@ export const extractV1Options = z
  .transform((obj) => ({
    ...obj,
    allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch,
-  }))
-  .refine(x => extractRefine(x.scrapeOptions), extractRefineOpts)
-  .transform(x => extractTransform(x.scrapeOptions));
+  }));
 
 export type ExtractV1Options = z.infer<typeof extractV1Options>;
 export const extractRequestSchema = extractV1Options;
@@ -345,8 +294,55 @@ export const scrapeRequestSchema = baseScrapeOptions
    timeout: z.number().int().positive().finite().safe().default(30000),
  })
  .strict(strictMessage)
-  .refine(extractRefine, extractRefineOpts)
-  .transform(extractTransform);
+  .refine(
+    (obj) => {
+      const hasExtractFormat = obj.formats?.includes("extract");
+      const hasExtractOptions = obj.extract !== undefined;
+      const hasJsonFormat = obj.formats?.includes("json");
+      const hasJsonOptions = obj.jsonOptions !== undefined;
+      return (
+        (hasExtractFormat && hasExtractOptions) ||
+        (!hasExtractFormat && !hasExtractOptions) ||
+        (hasJsonFormat && hasJsonOptions) ||
+        (!hasJsonFormat && !hasJsonOptions)
+      );
+    },
+    {
+      message:
+        "When 'extract' or 'json' format is specified, corresponding options must be provided, and vice versa",
+    },
+  )
+  .transform((obj) => {
+    // Handle timeout
+    if (
+      (obj.formats?.includes("extract") ||
+        obj.extract ||
+        obj.formats?.includes("json") ||
+        obj.jsonOptions) &&
+      !obj.timeout
+    ) {
+      obj = { ...obj, timeout: 60000 };
+    }
+
+    if (obj.formats?.includes("json")) {
+      obj.formats.push("extract");
+    }
+
+    // Convert JSON options to extract options if needed
+    if (obj.jsonOptions && !obj.extract) {
+      obj = {
+        ...obj,
+        extract: {
+          prompt: obj.jsonOptions.prompt,
+          systemPrompt: obj.jsonOptions.systemPrompt,
+          schema: obj.jsonOptions.schema,
+          mode: "llm",
+        },
+      };
+    }
+
+    return obj;
+  });
 
 export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
 export type ScrapeRequestInput = z.input<typeof scrapeRequestSchema>;
@@ -378,8 +374,20 @@ export const batchScrapeRequestSchema = baseScrapeOptions
    ignoreInvalidURLs: z.boolean().default(false),
  })
  .strict(strictMessage)
-  .refine(extractRefine, extractRefineOpts)
-  .transform(extractTransform);
+  .refine(
+    (obj) => {
+      const hasExtractFormat = obj.formats?.includes("extract");
+      const hasExtractOptions = obj.extract !== undefined;
+      return (
+        (hasExtractFormat && hasExtractOptions) ||
+        (!hasExtractFormat && !hasExtractOptions)
+      );
+    },
+    {
+      message:
+        "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
+    },
+  );
 
 export const batchScrapeRequestSchemaNoURLValidation = baseScrapeOptions
  .extend({
@@ -390,11 +398,22 @@ export const batchScrapeRequestSchemaNoURLValidation = baseScrapeOptions
    ignoreInvalidURLs: z.boolean().default(false),
  })
  .strict(strictMessage)
-  .refine(extractRefine, extractRefineOpts)
-  .transform(extractTransform);
+  .refine(
+    (obj) => {
+      const hasExtractFormat = obj.formats?.includes("extract");
+      const hasExtractOptions = obj.extract !== undefined;
+      return (
+        (hasExtractFormat && hasExtractOptions) ||
+        (!hasExtractFormat && !hasExtractOptions)
+      );
+    },
+    {
+      message:
+        "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
+    },
+  );
 
 export type BatchScrapeRequest = z.infer<typeof batchScrapeRequestSchema>;
-export type BatchScrapeRequestInput = z.input<typeof batchScrapeRequestSchema>;
 
 const crawlerOptions = z
  .object({
@@ -432,9 +451,7 @@ export const crawlRequestSchema = crawlerOptions
    webhook: webhookSchema.optional(),
    limit: z.number().default(10000),
  })
-  .strict(strictMessage)
-  .refine(x => extractRefine(x.scrapeOptions), extractRefineOpts)
-  .transform(x => extractTransform(x.scrapeOptions));
+  .strict(strictMessage);
 
 // export type CrawlRequest = {
 //   url: string;
@@ -918,9 +935,7 @@ export const searchRequestSchema = z
  })
  .strict(
    "Unrecognized key in body -- please review the v1 API documentation for request body changes",
-  )
-  .refine(x => extractRefine(x.scrapeOptions), extractRefineOpts)
-  .transform(x => extractTransform(x.scrapeOptions));
+  );
 
 export type SearchRequest = z.infer<typeof searchRequestSchema>;