-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathedgar_import.cypher
212 lines (206 loc) · 7.22 KB
/
edgar_import.cypher
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
/**
* Load Form 10-K data (JSON files) and Form 13 data (a CSV file) from remote files.
*
* The process is:
*
* 1. Load Form 10-K data from JSON files, creating one `(:Form)` per file
* 2. Migrate the text sections of the Form 10-K to `(:Chunk)` nodes, one per section
* 3. Connect `(:Form)-[:SECTION]->(:Chunk)` relationships
* 4. Split each section text into chunks of 1000 words and create a `(:Chunk)` for each chunk
* 5. Create a linked list of `(:Chunk)-[:NEXT]->(:Chunk)` relationships
* 6. Generate embeddings for each chunk using the OpenAI API
* 7. Load Form 13 data from a CSV file, creating `(:Company)` and `(:Manager)` nodes
* 8. Create `(:Manager)-[:OWNS_STOCK_IN]->(:Company)` relationships
* 9. Connect `(:Company)-[:FILED]->(:Form)` relationships
*
* The resulting graph will look like this:
*
* @graph ```
* (:Form => {
* formId :: string!, // a unique identifier for the form
* source :: string!, // a link back to the original 10k document
* summary :: string, // text summary generated with the LLM **NOTE: not yet implemented! **
* summaryEmbeddings :: list<float> // vector embedding of summary **NOTE: not yet implemented! **
* })
*
* (:Chunk => {
* chunkId :: string!, // a unique identifier for the chunk
* text :: string!, // the text of the chunk
* textEmbedding :: list<float> // vector embedding of the text
* })
*
* // @kind contains
* // @synonyms
* (:Form)=[:SECTION => { item :: string }]=>(:Chunk)
*
* // @kind peer
* // @antonym previous
* (:Chunk)=[:NEXT^1]=>(:Chunk)
*
* // @kind membership
* (:Chunk)=[:PART_OF^1]=>(:Form)
*
* (:Company => {
* cik :: int!, // the Central Index Key for the company
* cusip6:: string!, // the CUSIP6 identifier for the company
* name :: string!, // the name of the company
* names :: list<string>, // list of alternative names for the company
* cusip:: string, // known CUSIP identifier for the company
* address :: string // the address of the company **NOTE: not yet implemented! **
* })
*
* (:Manager => {
* cik :: int!, // the Central Index Key for the manager
* name :: string, // the name of the manager
* address :: string // the address of the manager
* })
*
* (:Manager)=[:OWNS_STOCK_IN]=>(:Company)
* (:Company)=[:FILED]=>(:Form)
* ```
*
* @module LoadEdgarKG
* @plugins apoc, genai
* @param openAiApiKey::string - OpenAI API key
* @param baseURL::string - Base URL for the data files
*/
// Parameters with default values.
// openAiApiKey: token passed to genai.vector.encode when chunk embeddings are
//               generated; replace the placeholder with a real key before running.
// baseURL:      prefix prepended to every data file this script loads
//               (form10k/index.csv, form10k/<filename> and form13.csv).
:params {
openAiApiKey: "paste your OpenAI API key here",
baseURL: "https://raw.githubusercontent.com/neo4j-examples/sec-edgar-notebooks/main/data/sample/"
}
;
// Register (or refresh) the KnowledgeGraph node that describes this module.
// createdAt is stamped when the node is first created, lastOperation on every
// later run that matches the existing node.
// sources is set unconditionally so it is also recorded on the very first run,
// when the node is created rather than matched.
MERGE (kg:KnowledgeGraph {name: "EdgarKG"})
  ON CREATE SET kg.createdAt = datetime()
  ON MATCH SET kg.lastOperation = datetime()
SET kg.sources = [$baseURL + 'form10k/*', $baseURL + 'form13.csv']
RETURN kg.name as name // first statement in a module must RETURN module name
;
////////////////////////////////////////////////
// Load Form 10-K data
// Uniqueness constraints: one Form per formId, one Chunk per chunkId
CREATE CONSTRAINT unique_form IF NOT EXISTS
  FOR (form:Form)
  REQUIRE form.formId IS UNIQUE
;
CREATE CONSTRAINT unique_chunk IF NOT EXISTS
  FOR (chunk:Chunk)
  REQUIRE chunk.chunkId IS UNIQUE
;
// Vector index over Chunk.textEmbedding (1536 dimensions, cosine similarity)
CREATE VECTOR INDEX `form_10k_chunks` IF NOT EXISTS
  FOR (chunk:Chunk)
  ON (chunk.textEmbedding)
  OPTIONS {
    indexConfig: {
      `vector.dimensions`: 1536,
      `vector.similarity_function`: 'cosine'
    }
  }
;
// Needed for Spring AI (v0.8.1 and earlier): creating the constraint here
// avoids the application error raised when the app tries to create it itself.
CREATE CONSTRAINT chunk_unique_idx IF NOT EXISTS
  FOR (chunk:Chunk)
  REQUIRE chunk.id IS UNIQUE
;
// Read the index of Form 10-K files, then load each JSON document into a Form node
LOAD CSV WITH HEADERS FROM $baseURL + 'form10k/index.csv' AS indexRow
WITH indexRow.filename AS jsonFile
CALL {
  WITH jsonFile
  // formId is the first capture group: the non-slash characters preceding ".json"
  WITH jsonFile, apoc.text.regexGroups(jsonFile,'([^\/]*)\.json')[0][1] AS formId
  CALL apoc.load.json($baseURL + 'form10k/' + jsonFile) YIELD value AS formJson
  MERGE (form:Form {formId: formId})
    // a newly created Form takes every field of the JSON document as a property
    ON CREATE SET form = formJson, form.formId = formId
}
;
// Move each Form 10-K text section onto its own Chunk node (sequence id "0000")
UNWIND ['item1','item1a','item7','item7a'] AS item
CALL {
  WITH item
  MATCH (form:Form)
  WITH form, item, form.formId + "-" + item + "-chunk" + "0000" AS chunkId
  MERGE (section:Chunk {chunkId: chunkId})
    // the section text is read from the Form property named by item
    ON CREATE SET section.text = apoc.any.property(form, item)
  MERGE (form)-[:SECTION {item: item}]->(section)
  MERGE (section)-[:PART_OF]->(form)
}
;
// The section texts were copied to Chunk nodes; drop them from every Form
MATCH (form:Form)
REMOVE form.item1, form.item1a, form.item7, form.item7a
;
// Split the text into chunks of 1000 words
// For every section chunk: tokenize its text on whitespace, regroup the tokens
// into partitions of up to 1000, and re-join each partition with single spaces.
// The first partition stays on the section chunk; every further partition
// becomes a new Chunk node, chained with NEXT relationships.
MATCH (f:Form)-[s:SECTION]->(first:Chunk)
WITH f, s, first
WITH f, s, first, apoc.text.split(first.text, "\s+") as tokens
CALL apoc.coll.partition(tokens, 1000) YIELD value
WITH f, s, first, apoc.text.join(value, " ") as chunk
// gather the partitions back into one list per section
WITH f, s, first, collect(chunk) as chunks
CALL {
WITH f, s, first, chunks
// build properties for chunks 1..n-1 only (index 0 remains on `first`);
// the range is empty when the section produced a single chunk
WITH f, s, first, chunks, [idx in range(1, size(chunks) -1) |
{ chunkId: f.formId + "-" + s.item + "-chunk" + apoc.number.format(idx, "#0000"), text: chunks[idx] }] as chunkProps
CALL apoc.create.nodes(["Chunk"], chunkProps) yield node
// runs once per created node, so it only takes effect when at least one
// additional chunk was created
SET first.text = head(chunks)
MERGE (node)-[:PART_OF]->(f)
WITH first, collect(node) as chunkNodes
// chain the newly created chunks together with NEXT relationships
CALL apoc.nodes.link(chunkNodes, 'NEXT')
WITH first, head(chunkNodes) as nextNode
// attach the section chunk to the front of the chain
MERGE (first)-[:NEXT]->(nextNode)
}
RETURN f.formId
;
// Spring AI requires an id field on vector-indexed nodes: copy chunkId into id
// for every Chunk that does not have one yet
MATCH (c:Chunk)
WHERE c.id IS NULL
SET c.id = c.chunkId
;
////////////////////////////////////////////////////////////////
// Load Form 13 data
// Company nodes are keyed by cusip6
CREATE CONSTRAINT unique_company IF NOT EXISTS
  FOR (company:Company)
  REQUIRE company.cusip6 IS UNIQUE
;
// Full-text search over the list of company names
CREATE FULLTEXT INDEX fullTextCompanyNames IF NOT EXISTS
  FOR (company:Company)
  ON EACH [company.names]
;
// Manager nodes are keyed by cik
CREATE CONSTRAINT unique_manager IF NOT EXISTS
  FOR (manager:Manager)
  REQUIRE manager.cik IS UNIQUE
;
// Full-text search over manager names
CREATE FULLTEXT INDEX fullTextManagerNames IF NOT EXISTS
  FOR (manager:Manager)
  ON EACH [manager.name]
;
// For every CSV row, make sure the Company (keyed by cusip6), the Manager
// (keyed by cik) and one OWNS_STOCK_IN relationship per reporting period exist.
// Descriptive properties are only written when the node/relationship is created.
LOAD CSV WITH HEADERS FROM $baseURL + "form13.csv" AS holding
MERGE (company:Company {cusip6: holding.cusip6})
  ON CREATE SET
    company.name = holding.companyName,
    company.cusip = holding.cusip
MERGE (manager:Manager {cik: toInteger(holding.managerCik)})
  ON CREATE SET
    manager.name = holding.managerName,
    manager.address = holding.managerAddress
MERGE (manager)-[stake:OWNS_STOCK_IN {reportCalendarOrQuarter: holding.reportCalendarOrQuarter}]->(company)
  ON CREATE SET
    stake.value = toFloat(holding.value),
    stake.shares = toInteger(holding.shares)
;
// Pair each Form with the Company sharing its cusip6, move the company-level
// fields from the Form onto the Company, then record that the Company filed it
MATCH (form:Form)
MATCH (company:Company)
WHERE company.cusip6 = form.cusip6
SET company.names = form.names,
    company.cik = form.cik
// the identifying fields now live on the Company; remove them from the Form
REMOVE form.names, form.cik, form.cusip6, form.cusip
MERGE (company)-[:FILED]->(form)
;
// Generate embeddings for each chunk. This may take a while.
// Only chunks that still lack an embedding are selected, so the statement can
// be re-run to resume. Chunks without text are skipped: there is nothing to
// embed, and without the guard a null would be handed to the encoder and the
// same chunks would be selected again on every run.
MATCH (chunk:Chunk)
WHERE chunk.textEmbedding IS NULL
  AND chunk.text IS NOT NULL
CALL {
  WITH chunk
  // one encode call per chunk, using the OpenAI provider and the supplied key
  WITH chunk, genai.vector.encode(chunk.text, "OpenAI", {token: $openAiApiKey}) AS vector
  CALL db.create.setNodeVectorProperty(chunk, "textEmbedding", vector)
} IN TRANSACTIONS OF 10 ROWS
;