-
Notifications
You must be signed in to change notification settings - Fork 29
/
scraper.ts
122 lines (99 loc) · 3.25 KB
/
scraper.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import { TRPCError } from "@trpc/server"
import {
ExtractNumberSchema,
JsonLdRecipeSchema,
RecipeImageUrlSchema,
RecipeStepSchema,
} from "~/server/api/modules/recipes/service/schemas"
import { InsertIngredient, InsertRecipe } from "~/server/db/schema"
import { JSDOM } from "jsdom"
import { logger } from "~/lib/logger"
/**
 * Loads the page at `url` with JSDOM and collects every
 * `<script type="application/ld+json">` element found in its document.
 *
 * @param url - Address of the page to load.
 * @returns The non-empty list of matching script nodes.
 * @throws TRPCError with code `INTERNAL_SERVER_ERROR` when the page has no
 *   such script element.
 */
async function getNodeListOfMetadataNodesFromUrl(url: string) {
  const { window } = await JSDOM.fromURL(url)
  const metadataNodes: NodeList = window.document.querySelectorAll(
    "script[type='application/ld+json']"
  )

  // Guard-clause form: hand back the nodes as soon as we know there are some.
  if (metadataNodes.length > 0) {
    return metadataNodes
  }

  throw new TRPCError({
    message: "The linked page contains no metadata",
    code: "INTERNAL_SERVER_ERROR",
  })
}
/**
 * Tells whether a parsed JSON-LD object describes a recipe.
 *
 * The object must validate against `JsonLdRecipeSchema`, and its `@type`
 * value must contain the text "recipe" (compared case-insensitively).
 *
 * @param jsonObject - Candidate object taken from a JSON-LD payload.
 * @returns `true` only when both conditions hold.
 */
function jsonObjectIsRecipe(jsonObject: Record<string, unknown>): boolean {
  const result = JsonLdRecipeSchema.safeParse(jsonObject)
  return (
    result.success && result.data["@type"].toLowerCase().includes("recipe")
  )
}
/**
 * Tells whether a parsed JSON-LD value carries its own `@graph` property.
 *
 * Only own properties count; a `@graph` inherited through the prototype
 * chain is ignored.
 *
 * @param jsonObject - Value obtained from parsing a JSON-LD payload. Callers
 *   pass the untyped result of `JSON.parse`, so `null` and primitives can
 *   arrive here despite the declared type.
 * @returns `true` when `jsonObject` is a non-null object with an own
 *   `@graph` key, otherwise `false`.
 */
function jsonObjectHasGraph(jsonObject: Record<string, unknown>): boolean {
  // `hasOwnProperty.call(null | undefined, …)` throws a TypeError, and the
  // JSON text `null` is a valid payload, so reject non-objects up front.
  if (typeof jsonObject !== "object" || jsonObject === null) {
    return false
  }
  return Object.prototype.hasOwnProperty.call(jsonObject, "@graph")
}
/**
 * Scans JSON-LD script nodes for the first object that `jsonObjectIsRecipe`
 * accepts.
 *
 * Each node's text content is parsed as JSON, and three payload shapes are
 * searched, in this order per node:
 *  - a top-level array: each element is checked;
 *  - a single object: the object itself is checked;
 *  - an object with an own `@graph` array: each graph entry is checked.
 *
 * Nodes that are empty, fail to parse, or hold any other kind of value are
 * skipped and the search moves on to the next node.
 *
 * @param nodeList - Nodes whose `textContent` is expected to hold JSON-LD.
 * @returns The first matching object exactly as parsed. It is untyped
 *   (`any`) because it comes straight from `JSON.parse`.
 * @throws Error when no node yields a recipe object.
 */
function getSchemaRecipeFromNodeList(nodeList: NodeList) {
  for (const node of nodeList.values()) {
    const { textContent } = node
    if (!textContent) {
      logger.debug("No text content in node, trying next node")
      continue
    }

    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    let parsedNodeContent: any
    try {
      parsedNodeContent = JSON.parse(textContent)
    } catch (e) {
      logger.error(
        { error: e, textContent },
        "Error in extracting JSON from node content"
      )
      continue
    }

    if (Array.isArray(parsedNodeContent)) {
      logger.debug("Metadata node contains an array, checking each entry")
      for (const metadataObject of parsedNodeContent) {
        if (jsonObjectIsRecipe(metadataObject)) {
          return metadataObject
        }
      }
    } else if (
      typeof parsedNodeContent === "object" &&
      parsedNodeContent !== null
    ) {
      // Only real objects get here: valid JSON can also be `null` or a
      // primitive, which can neither be a recipe nor carry a `@graph`.
      if (jsonObjectIsRecipe(parsedNodeContent)) {
        return parsedNodeContent
      }
      if (jsonObjectHasGraph(parsedNodeContent)) {
        const graph: unknown = parsedNodeContent["@graph"]
        // `@graph` comes from an external page; iterating a non-array value
        // with for...of would throw, so skip the node instead.
        if (!Array.isArray(graph)) {
          logger.debug("@graph is not an array, trying next node")
          continue
        }
        for (const graphNode of graph) {
          if (jsonObjectIsRecipe(graphNode)) {
            return graphNode
          }
        }
      }
    }
  }
  throw new Error("Unable to extract Recipe metadata from provided url")
}
/**
 * Scrapes the page at `url` and converts its JSON-LD recipe metadata into
 * insertable recipe and ingredient records.
 *
 * The recipe object is located via `getNodeListOfMetadataNodesFromUrl` and
 * `getSchemaRecipeFromNodeList`. Its fields are then handled as follows:
 *  - `recipeInstructions` must validate as an array of `RecipeStepSchema`;
 *    the parsed entries are joined with newlines into `steps`.
 *  - `recipeIngredient` must be an array that, flattened by one level,
 *    contains only strings; each is trimmed and stored as `scrapedName`.
 *  - `image` and `recipeYield` are optional: when `RecipeImageUrlSchema` /
 *    `ExtractNumberSchema` reject them, the field is left `undefined`.
 *
 * @param url - Address of the recipe page; also stored on the result.
 * @returns The recipe record and its scraped ingredient records.
 * @throws Error "Could not parse steps" when the instructions fail
 *   validation.
 * @throws Error "Could not parse ingredients" when the ingredient list is
 *   missing, not an array, or contains a non-string entry.
 * @throws Whatever the two lookup helpers throw when the page cannot be
 *   loaded or holds no recipe metadata.
 */
export async function hydrateRecipe(url: string): Promise<{
  recipe: InsertRecipe
  ingredients: Pick<InsertIngredient, "scrapedName">[]
}> {
  const nodeList = await getNodeListOfMetadataNodesFromUrl(url)
  const recipeData = getSchemaRecipeFromNodeList(nodeList)

  const steps = RecipeStepSchema.array().safeParse(
    recipeData.recipeInstructions
  )
  if (!steps.success) {
    throw new Error("Could not parse steps")
  }

  // The metadata is untyped, externally supplied JSON: validate the shape
  // here so a malformed page produces a clear error rather than a TypeError
  // from calling `.flat()` / `.trim()` on the wrong kind of value.
  const rawIngredients: unknown = recipeData.recipeIngredient
  if (!Array.isArray(rawIngredients)) {
    throw new Error("Could not parse ingredients")
  }
  const ingredients: Pick<InsertIngredient, "scrapedName">[] = []
  // One level of flattening, for payloads that nest the list in sub-arrays.
  for (const entry of rawIngredients.flat()) {
    if (typeof entry !== "string") {
      throw new Error("Could not parse ingredients")
    }
    ingredients.push({ scrapedName: entry.trim() })
  }

  const image = RecipeImageUrlSchema.safeParse(recipeData.image)
  const servings = ExtractNumberSchema.safeParse(recipeData.recipeYield)

  const recipe: InsertRecipe = {
    name: recipeData.name,
    url,
    steps: steps.data.join("\n"),
    imageUrl: image.success ? image.data : undefined,
    servings: servings.success ? servings.data : undefined,
  }
  return { recipe, ingredients }
}