From 7d7490f7f2235ba0389db58d301c7efe2df41abe Mon Sep 17 00:00:00 2001 From: John Date: Wed, 1 Jan 2025 22:57:23 -0800 Subject: [PATCH] added metadata tutorial --- contributors.md | 3 +- ...mission_attachments_metadata_dataset.ipynb | 95 --- examples/submission_metadata_tutorial.ipynb | 633 ++++++++++++++++++ 3 files changed, 635 insertions(+), 96 deletions(-) delete mode 100644 examples/construct_tenk_submission_attachments_metadata_dataset.ipynb create mode 100644 examples/submission_metadata_tutorial.ipynb diff --git a/contributors.md b/contributors.md index 3ecfc954..6fdab18a 100644 --- a/contributors.md +++ b/contributors.md @@ -2,4 +2,5 @@ contributors: * john-friedman * dermonito * AnirudhJM24 -* GuangzheLeLe Wang \ No newline at end of file +* GuangzheLeLe Wang +* Owen Gordon \ No newline at end of file diff --git a/examples/construct_tenk_submission_attachments_metadata_dataset.ipynb b/examples/construct_tenk_submission_attachments_metadata_dataset.ipynb deleted file mode 100644 index eb490154..00000000 --- a/examples/construct_tenk_submission_attachments_metadata_dataset.ipynb +++ /dev/null @@ -1,95 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "WIP DONT USE\n", - "Demo for creating a dataset of the attachments of a 10k\n", - "\n", - " (root_form, document_type, filing_date, report_date, report year...), firm level information (cik, ticker, name...)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Cost: $0.000270000000 downloads + $0.000211797000 row reads = $0.000481797000\n", - "Balance: $8.210451053000\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Processing files: 100%|██████████| 27/27 [00:03<00:00, 7.24it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Processing completed in 3.74 seconds\n" - ] - }, - { - "name": 
"stderr", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "from datamule import PremiumDownloader, Portfolio\n", - "\n", - "downloader = PremiumDownloader() # Downloader() also works, but is slower\n", - "\n", - "# Download the data\n", - "downloader.download_submissions(submission_type=['10-K'],ticker='AAPL',output_dir='ingest/appl_10k')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The structure" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/submission_metadata_tutorial.ipynb b/examples/submission_metadata_tutorial.ipynb new file mode 100644 index 00000000..93b2ca49 --- /dev/null +++ b/examples/submission_metadata_tutorial.ipynb @@ -0,0 +1,633 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SEC Submission Metadata\n", + "\n", + "Submissions uploaded to the SEC contain metadata with information about the company and the submission. 
This metadata is stored as SGML, which comes in two variants: archive SGML and EDGAR SGML.\n", + "\n", + "## EDGAR SGML metadata\n", + "```\n", + "0000950170-22-000796.txt : 20220207\n", + "0000950170-22-000796.hdr.sgml : 20220207\n", + "20220204201127\n", + "ACCESSION NUMBER:\t\t0000950170-22-000796\n", + "CONFORMED SUBMISSION TYPE:\t10-K\n", + "PUBLIC DOCUMENT COUNT:\t\t134\n", + "CONFORMED PERIOD OF REPORT:\t20211231\n", + "FILED AS OF DATE:\t\t20220207\n", + "DATE AS OF CHANGE:\t\t20220204\n", + "\n", + "FILER:\n", + "\n", + "\tCOMPANY DATA:\t\n", + "\t\tCOMPANY CONFORMED NAME:\t\t\tTesla, Inc.\n", + "\t\tCENTRAL INDEX KEY:\t\t\t0001318605\n", + "\t\tSTANDARD INDUSTRIAL CLASSIFICATION:\tMOTOR VEHICLES & PASSENGER CAR BODIES [3711]\n", + "\t\tIRS NUMBER:\t\t\t\t912197729\n", + "\t\tSTATE OF INCORPORATION:\t\t\tDE\n", + "\t\tFISCAL YEAR END:\t\t\t1231\n", + "\n", + "\tFILING VALUES:\n", + "\t\tFORM TYPE:\t\t10-K\n", + "\t\tSEC ACT:\t\t1934 Act\n", + "\t\tSEC FILE NUMBER:\t001-34756\n", + "\t\tFILM NUMBER:\t\t22595227\n", + "\n", + "\tBUSINESS ADDRESS:\t\n", + "\t\tSTREET 1:\t\t3500 DEER CREEK RD\n", + "\t\tCITY:\t\t\tPALO ALTO\n", + "\t\tSTATE:\t\t\tCA\n", + "\t\tZIP:\t\t\t94304\n", + "\t\tBUSINESS PHONE:\t\t650-681-5000\n", + "\n", + "\tMAIL ADDRESS:\t\n", + "\t\tSTREET 1:\t\t3500 DEER CREEK RD\n", + "\t\tCITY:\t\t\tPALO ALTO\n", + "\t\tSTATE:\t\t\tCA\n", + "\t\tZIP:\t\t\t94304\n", + "\n", + "\tFORMER COMPANY:\t\n", + "\t\tFORMER CONFORMED NAME:\tTESLA MOTORS INC\n", + "\t\tDATE OF NAME CHANGE:\t20050222\n", + "\n", + "```\n", + "\n", + "## Archive SGML metadata\n", + "```\n", + "\n", + "0000950170-24-009914\n", + "3\n", + "1\n", + "20240201\n", + "20240201\n", + "20240201\n", + "\n", + "\n", + "THIEL PETER\n", + "0001211060\n", + "\n", + "\n", + "\n", + "3\n", + "34\n", + "001-41942\n", + "24588225\n", + "\n", + "\n", + "9200 SUNSET BOULEVARD\n", + "SUITE 1110\n", + "WEST HOLLYWOOD\n", + "CA\n", + "90069\n", + "323-990-2000\n", + "\n", + "\n", + "9200 
SUNSET BOULEVARD\n", + "SUITE 1110\n", + "WEST HOLLYWOOD\n", + "CA\n", + "90069\n", + "\n", + "\n", + "\n", + "\n", + "Mithril LP\n", + "0001552273\n", + "\n", + "DE\n", + "1231\n", + "\n", + "\n", + "3\n", + "34\n", + "001-41942\n", + "24588230\n", + "\n", + "\n", + "ONE LETTERMAN DRIVE\n", + "BUILDING C SUITE 400\n", + "SAN FRANCISCO\n", + "CA\n", + "94129\n", + "415-248-5147\n", + "\n", + "\n", + "ONE LETTERMAN DRIVE\n", + "BUILDING C SUITE 400\n", + "SAN FRANCISCO\n", + "CA\n", + "94129\n", + "\n", + "\n", + "\n", + "\n", + "MITHRIL II LP\n", + "0001669609\n", + "\n", + "DE\n", + "1231\n", + "\n", + "\n", + "3\n", + "34\n", + "001-41942\n", + "24588227\n", + "\n", + "\n", + "ONE LETTERMAN DRIVE, BLDG. A, SUITE 4900\n", + "SAN FRANCISCO\n", + "CA\n", + "94129\n", + "(415) 659-8940\n", + "\n", + "\n", + "ONE LETTERMAN DRIVE, BLDG. A, SUITE 4900\n", + "SAN FRANCISCO\n", + "CA\n", + "94129\n", + "\n", + "\n", + "\n", + "\n", + "Mithril GP LP\n", + "0001881891\n", + "\n", + "DE\n", + "1231\n", + "\n", + "\n", + "3\n", + "34\n", + "001-41942\n", + "24588226\n", + "\n", + "\n", + "C/O MITHRIL CAPITAL MANAGEMENT LLC\n", + "600 CONGRESS AVENUE, SUITE 3100\n", + "AUSTIN\n", + "TX\n", + "78701\n", + "512-717-3770\n", + "\n", + "\n", + "C/O MITHRIL CAPITAL MANAGEMENT LLC\n", + "600 CONGRESS AVENUE, SUITE 3100\n", + "AUSTIN\n", + "TX\n", + "78701\n", + "\n", + "\n", + "\n", + "\n", + "Mithril II UGP LLC\n", + "0001878230\n", + "\n", + "DE\n", + "1231\n", + "\n", + "\n", + "3\n", + "34\n", + "001-41942\n", + "24588229\n", + "\n", + "\n", + "600 CONGRESS AVENUE\n", + "SUITE 3100\n", + "AUSTIN\n", + "TX\n", + "78701\n", + "512-717-3770\n", + "\n", + "\n", + "600 CONGRESS AVENUE\n", + "SUITE 3100\n", + "AUSTIN\n", + "TX\n", + "78701\n", + "\n", + "\n", + "\n", + "\n", + "Mithril II GP LP\n", + "0001878232\n", + "\n", + "DE\n", + "1231\n", + "\n", + "\n", + "3\n", + "34\n", + "001-41942\n", + "24588228\n", + "\n", + "\n", + "600 CONGRESS AVENUE\n", + "SUITE 3100\n", + 
"AUSTIN\n", + "TX\n", + "78701\n", + "512-717-3770\n", + "\n", + "\n", + "600 CONGRESS AVENUE\n", + "SUITE 3100\n", + "AUSTIN\n", + "TX\n", + "78701\n", + "\n", + "\n", + "\n", + "\n", + "Fractyl Health, Inc.\n", + "0001572616\n", + "3841\n", + "08 Industrial Applications and Services\n", + "273553477\n", + "DE\n", + "1231\n", + "\n", + "\n", + "17 HARTWELL AVENUE\n", + "LEXINGTON\n", + "MA\n", + "02421\n", + "781-902-8800\n", + "\n", + "\n", + "17 HARTWELL AVENUE\n", + "LEXINGTON\n", + "MA\n", + "02421\n", + "\n", + "\n", + "Fractyl Laboratories Inc.\n", + "20130320\n", + "\n", + "\n", + "```\n", + "\n", + "## Differences between variants\n", + "\n", + "Both EDGAR and archive SGML contain the same information. However, the two variants differ in how keys are written: the archive variant uses dashes in key names, while the EDGAR variant uses spaces. Additionally, some values are abbreviated depending on the variant.\n", + "\n", + "## Datamule's Submission Structure\n", + "\n", + "Datamule parses the SGML submission file and splits it into its individual documents. 
It also creates a metadata.json file which contains the parsed metadata.\n", + "\n", + "For example, the downloaded file structure for Tesla's 2022 10-K looks like this:\n", + "```\n", + "000095017022000796/\n", + "\n", + "Metadata:\n", + "├── metadata.json\n", + "\n", + "Primary Documents:\n", + "├── tsla-20211231.htm (10-K)\n", + "├── tsla-ex21_1.htm (EX-21.1)\n", + "├── tsla-ex23_1.htm (EX-23.1)\n", + "├── tsla-ex31_1.htm (EX-31.1)\n", + "├── tsla-ex31_2.htm (EX-31.2)\n", + "├── tsla-ex32_1.htm (EX-32.1)\n", + "├── img96779317_0.jpg (GRAPHIC)\n", + "\n", + "Data Files:\n", + "├── tsla-20211231_def.xml (EX-101.DEF)\n", + "├── tsla-20211231_pre.xml (EX-101.PRE)\n", + "├── tsla-20211231_lab.xml (EX-101.LAB)\n", + "├── tsla-20211231.xsd (EX-101.SCH)\n", + "├── tsla-20211231_cal.xml (EX-101.CAL)\n", + "└── tsla-20211231_htm.xml (XBRL Instance)\n", + "```\n", + "We'll now look at the metadata.json file for Tesla." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Cost: $0.000010000000 downloads + $0.000007541000 row reads = $0.000017541000\n", + "Balance: $8.209951715000\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing files: 100%|██████████| 1/1 [00:01<00:00, 1.69s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Processing completed in 1.69 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "from datamule import PremiumDownloader, Downloader\n", + "\n", + "downloader = PremiumDownloader() # Downloader also works, but PremiumDownloader is faster\n", + "downloader.download_submissions(submission_type=['10-K'],ticker = 'TSLA', filing_date = ('2022-01-01', '2022-12-31'), output_dir = 'ingest/tesla')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## metadata.json\n", + "\n", + "```\n", + 
"{\n", + " \"submission\": {\n", + " \"ACCESSION-NUMBER\": \"0000950170-22-000796\",\n", + " \"TYPE\": \"10-K\",\n", + " \"PUBLIC-DOCUMENT-COUNT\": \"134\",\n", + " \"PERIOD\": \"20211231\",\n", + " \"FILING-DATE\": \"20220207\",\n", + " \"DATE-OF-FILING-DATE-CHANGE\": \"20220204\",\n", + " \"FILER\": {\n", + " \"COMPANY-DATA\": {\n", + " \"CONFORMED-NAME\": \"Tesla, Inc.\",\n", + " \"CIK\": \"0001318605\",\n", + " \"ASSIGNED-SIC\": \"3711\",\n", + " \"IRS-NUMBER\": \"912197729\",\n", + " \"STATE-OF-INCORPORATION\": \"DE\",\n", + " \"FISCAL-YEAR-END\": \"1231\"\n", + " },\n", + " \"FILING-VALUES\": {\n", + " \"FORM-TYPE\": \"10-K\",\n", + " \"ACT\": \"34\",\n", + " \"FILE-NUMBER\": \"001-34756\",\n", + " \"FILM-NUMBER\": \"22595227\"\n", + " },\n", + " \"BUSINESS-ADDRESS\": {\n", + " \"STREET1\": \"3500 DEER CREEK RD\",\n", + " \"CITY\": \"PALO ALTO\",\n", + " \"STATE\": \"CA\",\n", + " \"ZIP\": \"94304\",\n", + " \"PHONE\": \"650-681-5000\"\n", + " },\n", + " \"MAIL-ADDRESS\": {\n", + " \"STREET1\": \"3500 DEER CREEK RD\",\n", + " \"CITY\": \"PALO ALTO\",\n", + " \"STATE\": \"CA\",\n", + " \"ZIP\": \"94304\"\n", + " },\n", + " \"FORMER-COMPANY\": {\n", + " \"FORMER-CONFORMED-NAME\": \"TESLA MOTORS INC\",\n", + " \"DATE-CHANGED\": \"20050222\"\n", + " }\n", + " }\n", + " },\n", + " \"documents\": [\n", + " {\n", + " \"TYPE\": \"10-K\",\n", + " \"SEQUENCE\": \"1\",\n", + " \"FILENAME\": \"tsla-20211231.htm\",\n", + " \"DESCRIPTION\": \"10-K\"\n", + " },\n", + " {\n", + " \"TYPE\": \"EX-21.1\",\n", + " \"SEQUENCE\": \"2\",\n", + " \"FILENAME\": \"tsla-ex21_1.htm\",\n", + " \"DESCRIPTION\": \"EX-21.1\"\n", + " },\n", + " {\n", + " \"TYPE\": \"EX-23.1\",\n", + " \"SEQUENCE\": \"3\",\n", + " \"FILENAME\": \"tsla-ex23_1.htm\",\n", + " \"DESCRIPTION\": \"EX-23.1\"\n", + " },...\n", + "```\n", + "\n", + "## Interacting with metadata using datamule\n", + "\n", + "The easiest way to use metadata in datamule is through the Portfolio class." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading 1 submissions\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading submissions: 100%|██████████| 1/1 [00:00<00:00, 986.90it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'submission': {'ACCESSION-NUMBER': '0000950170-22-000796', 'TYPE': '10-K', 'PUBLIC-DOCUMENT-COUNT': '134', 'PERIOD': '20211231', 'FILING-DATE': '20220207', 'DATE-OF-FILING-DATE-CHANGE': '20220204', 'FILER': {'COMPANY-DATA': {'CONFORMED-NAME': 'Tesla, Inc.', 'CIK': '0001318605', 'ASSIGNED-SIC': '3711', 'IRS-NUMBER': '912197729', 'STATE-OF-INCORPORATION': 'DE', 'FISCAL-YEAR-END': '1231'}, 'FILING-VALUES': {'FORM-TYPE': '10-K', 'ACT': '34', 'FILE-NUMBER': '001-34756', 'FILM-NUMBER': '22595227'}, 'BUSINESS-ADDRESS': {'STREET1': '3500 DEER CREEK RD', 'CITY': 'PALO ALTO', 'STATE': 'CA', 'ZIP': '94304', 'PHONE': '650-681-5000'}, 'MAIL-ADDRESS': {'STREET1': '3500 DEER CREEK RD', 'CITY': 'PALO ALTO', 'STATE': 'CA', 'ZIP': '94304'}, 'FORMER-COMPANY': {'FORMER-CONFORMED-NAME': 'TESLA MOTORS INC', 'DATE-CHANGED': '20050222'}}}, 'documents': [{'TYPE': '10-K', 'SEQUENCE': '1', 'FILENAME': 'tsla-20211231.htm', 'DESCRIPTION': '10-K'}, {'TYPE': 'EX-21.1', 'SEQUENCE': '2', 'FILENAME': 'tsla-ex21_1.htm', 'DESCRIPTION': 'EX-21.1'}, {'TYPE': 'EX-23.1', 'SEQUENCE': '3', 'FILENAME': 'tsla-ex23_1.htm', 'DESCRIPTION': 'EX-23.1'}, {'TYPE': 'EX-31.1', 'SEQUENCE': '4', 'FILENAME': 'tsla-ex31_1.htm', 'DESCRIPTION': 'EX-31.1'}, {'TYPE': 'EX-31.2', 'SEQUENCE': '5', 'FILENAME': 'tsla-ex31_2.htm', 'DESCRIPTION': 'EX-31.2'}, {'TYPE': 'EX-32.1', 'SEQUENCE': '6', 'FILENAME': 'tsla-ex32_1.htm', 'DESCRIPTION': 'EX-32.1'}, {'TYPE': 'GRAPHIC', 'SEQUENCE': '7', 'FILENAME': 'img96779317_0.jpg', 'DESCRIPTION': 'GRAPHIC'}, {'TYPE': 'EX-101.DEF', 'SEQUENCE': '8', 'FILENAME': 'tsla-20211231_def.xml', 
'DESCRIPTION': 'XBRL TAXONOMY EXTENSION DEFINITION LINKBASE DOCUMENT'}, {'TYPE': 'EX-101.PRE', 'SEQUENCE': '9', 'FILENAME': 'tsla-20211231_pre.xml', 'DESCRIPTION': 'XBRL TAXONOMY EXTENSION PRESENTATION LINKBASE DOCUMENT'}, {'TYPE': 'EX-101.LAB', 'SEQUENCE': '10', 'FILENAME': 'tsla-20211231_lab.xml', 'DESCRIPTION': 'XBRL TAXONOMY EXTENSION LABEL LINKBASE DOCUMENT'}, {'TYPE': 'EX-101.SCH', 'SEQUENCE': '11', 'FILENAME': 'tsla-20211231.xsd', 'DESCRIPTION': 'XBRL TAXONOMY EXTENSION SCHEMA DOCUMENT'}, {'TYPE': 'EX-101.CAL', 'SEQUENCE': '12', 'FILENAME': 'tsla-20211231_cal.xml', 'DESCRIPTION': 'XBRL TAXONOMY EXTENSION CALCULATION LINKBASE DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '13', 'FILENAME': 'R1.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '14', 'FILENAME': 'R2.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '15', 'FILENAME': 'R3.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '16', 'FILENAME': 'R4.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '17', 'FILENAME': 'R5.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '18', 'FILENAME': 'R6.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '19', 'FILENAME': 'R7.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '20', 'FILENAME': 'R8.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '21', 'FILENAME': 'R9.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '22', 'FILENAME': 'R10.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '23', 'FILENAME': 'R11.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '24', 'FILENAME': 'R12.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '25', 'FILENAME': 'R13.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '26', 'FILENAME': 'R14.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 
'XML', 'SEQUENCE': '27', 'FILENAME': 'R15.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '28', 'FILENAME': 'R16.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '29', 'FILENAME': 'R17.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '30', 'FILENAME': 'R18.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '31', 'FILENAME': 'R19.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '32', 'FILENAME': 'R20.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '33', 'FILENAME': 'R21.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '34', 'FILENAME': 'R22.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '35', 'FILENAME': 'R23.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '36', 'FILENAME': 'R24.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '37', 'FILENAME': 'R25.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '38', 'FILENAME': 'R26.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '39', 'FILENAME': 'R27.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '40', 'FILENAME': 'R28.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '41', 'FILENAME': 'R29.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '42', 'FILENAME': 'R30.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '43', 'FILENAME': 'R31.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '44', 'FILENAME': 'R32.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '45', 'FILENAME': 'R33.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '46', 'FILENAME': 'R34.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '47', 'FILENAME': 'R35.htm', 'DESCRIPTION': 'IDEA: XBRL 
DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '48', 'FILENAME': 'R36.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '49', 'FILENAME': 'R37.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '50', 'FILENAME': 'R38.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '51', 'FILENAME': 'R39.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '52', 'FILENAME': 'R40.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '53', 'FILENAME': 'R41.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '54', 'FILENAME': 'R42.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '55', 'FILENAME': 'R43.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '56', 'FILENAME': 'R44.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '57', 'FILENAME': 'R45.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '58', 'FILENAME': 'R46.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '59', 'FILENAME': 'R47.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '60', 'FILENAME': 'R48.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '61', 'FILENAME': 'R49.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '62', 'FILENAME': 'R50.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '63', 'FILENAME': 'R51.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '64', 'FILENAME': 'R52.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '65', 'FILENAME': 'R53.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '66', 'FILENAME': 'R54.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '67', 'FILENAME': 'R55.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '68', 'FILENAME': 'R56.htm', 
'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '69', 'FILENAME': 'R57.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '70', 'FILENAME': 'R58.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '71', 'FILENAME': 'R59.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '72', 'FILENAME': 'R60.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '73', 'FILENAME': 'R61.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '74', 'FILENAME': 'R62.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '75', 'FILENAME': 'R63.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '76', 'FILENAME': 'R64.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '77', 'FILENAME': 'R65.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '78', 'FILENAME': 'R66.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '79', 'FILENAME': 'R67.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '80', 'FILENAME': 'R68.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '81', 'FILENAME': 'R69.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '82', 'FILENAME': 'R70.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '83', 'FILENAME': 'R71.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '84', 'FILENAME': 'R72.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '85', 'FILENAME': 'R73.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '86', 'FILENAME': 'R74.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '87', 'FILENAME': 'R75.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '88', 'FILENAME': 'R76.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '89', 
'FILENAME': 'R77.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '90', 'FILENAME': 'R78.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '91', 'FILENAME': 'R79.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '92', 'FILENAME': 'R80.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '93', 'FILENAME': 'R81.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '94', 'FILENAME': 'R82.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '95', 'FILENAME': 'R83.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '96', 'FILENAME': 'R84.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '97', 'FILENAME': 'R85.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '98', 'FILENAME': 'R86.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '99', 'FILENAME': 'R87.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '100', 'FILENAME': 'R88.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '101', 'FILENAME': 'R89.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '102', 'FILENAME': 'R90.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '103', 'FILENAME': 'R91.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '104', 'FILENAME': 'R92.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '105', 'FILENAME': 'R93.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '106', 'FILENAME': 'R94.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '107', 'FILENAME': 'R95.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '108', 'FILENAME': 'R96.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '109', 'FILENAME': 'R97.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, 
{'TYPE': 'XML', 'SEQUENCE': '110', 'FILENAME': 'R98.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '111', 'FILENAME': 'R99.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '112', 'FILENAME': 'R100.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '113', 'FILENAME': 'R101.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '114', 'FILENAME': 'R102.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '115', 'FILENAME': 'R103.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '116', 'FILENAME': 'R104.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '117', 'FILENAME': 'R105.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '118', 'FILENAME': 'R106.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '119', 'FILENAME': 'R107.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '120', 'FILENAME': 'R108.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '121', 'FILENAME': 'R109.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '122', 'FILENAME': 'R110.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '123', 'FILENAME': 'R111.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '124', 'FILENAME': 'R112.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '125', 'FILENAME': 'R113.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '126', 'FILENAME': 'R114.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '127', 'FILENAME': 'R115.htm', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '128', 'FILENAME': 'tsla-20211231_htm.xml', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'EXCEL', 'SEQUENCE': '129', 'FILENAME': 'Financial_Report.xlsx', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 
'XML', 'SEQUENCE': '130', 'FILENAME': 'Show.js', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '131', 'FILENAME': 'report.css', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'XML', 'SEQUENCE': '132', 'FILENAME': 'FilingSummary.xml', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'JSON', 'SEQUENCE': '135', 'FILENAME': 'MetaLinks.json', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}, {'TYPE': 'ZIP', 'SEQUENCE': '136', 'FILENAME': '0000950170-22-000796-xbrl.zip', 'DESCRIPTION': 'IDEA: XBRL DOCUMENT'}]}\n", + "0000950170-22-000796\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "from datamule import Portfolio\n", + "\n", + "# multithreaded to be faster\n", + "portfolio = Portfolio('ingest/tesla')\n", + "\n", + "for submission in portfolio:\n", + " # printing the metadata of the submission\n", + " print(submission.metadata)\n", + "\n", + " # accessing a key in the metadata\n", + " print(submission.metadata['submission']['ACCESSION-NUMBER'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Currently the EDGAR and archive metadata variants are not standardized, so the keys in metadata.json depend on which variant a submission was parsed from. This is an inconvenience, and I'm looking into harmonizing them. If this is important to your workflow, please post a GitHub issue and I'll move it up the priority list.\n", + "\n", + "Lots of things to work on! Last updated 1/1/2025." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example\n", + "\n", + "A simple demo that creates a dataset of 10-K attachments with their metadata. Requested via email from a user who wanted an alternative to a product offered by WRDS SEC Suites. If you're reading this, hi :)\n", + "\n", + "The demo will focus on Apple, and will create a CSV with:\n", + "\n", + "* filename - the filename of a file in the submission, e.g. a10-kexhibit10202017.htm\n", + "* submission_type - the type of submission, e.g. 10-K\n", + "* document_type - the type of document, e.g. 
GRAPHIC, EX-22.1, 10-K (i.e. the form itself or one of its attachments)\n", + "* filing_date - the date the submission was filed\n", + "* report_date - reporting period, e.g. 20170930\n", + "* report_year - the first 4 digits of the reporting period\n", + "* cik - company's unique identifier\n", + "* ticker - company's current ticker - see the cautionary note below\n", + "* company_conformed_name - company's name, e.g. APPLE INC" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Progress [Rate: 5.0/s | 0.01 MB/s]: 100%|██████████| 24/24 [00:02<00:00, 11.24it/s]\n", + "Progress [Rate: 5.0/s | 10.83 MB/s]: 100%|██████████| 24/24 [00:05<00:00, 4.50it/s]\n" + ] + }, + { + "data": { + "text/plain": [ + "([], [])" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Download the data\n", + "from datamule import Downloader\n", + "\n", + "downloader = Downloader()\n", + "\n", + "# Download the data\n", + "downloader.download_submissions(submission_type=['10-K'],ticker='AAPL',output_dir='ingest/appl_10k')" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading 24 submissions\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading submissions: 100%|██████████| 24/24 [00:00<00:00, 1535.53it/s]\n" + ] + } + ], + "source": [ + "from datamule import Portfolio, load_package_dataset\n", + "import pandas as pd\n", + "\n", + "portfolio = Portfolio('ingest/appl_10k')\n", + "\n", + "# Create lists to store document data\n", + "data = []\n", + "\n", + "# Note: You may have to adjust the metadata keys to match the structure of the data\n", + "for submission in portfolio:\n", + " metadata = submission.metadata\n", + " sub_info = metadata['submission']\n", + " \n", + " # Extract base submission info\n", + " base_info = {\n", + " 
'submission_type': sub_info['CONFORMED SUBMISSION TYPE'],\n", + " 'filing_date': sub_info['FILED AS OF DATE'],\n", + " 'report_date': sub_info['CONFORMED PERIOD OF REPORT'],\n", + " 'report_year': sub_info['CONFORMED PERIOD OF REPORT'][:4],\n", + " 'cik': sub_info['FILER']['COMPANY DATA']['CENTRAL INDEX KEY'],\n", + " 'company_conformed_name': sub_info['FILER']['COMPANY DATA']['COMPANY CONFORMED NAME'],\n", + " }\n", + " \n", + " # Add each document with the base info\n", + " for doc in metadata['documents']: # documents is at root level\n", + " row = base_info.copy()\n", + " row['filename'] = doc['FILENAME']\n", + " row['document_type'] = doc['TYPE']\n", + " data.append(row)\n", + "\n", + "# Create DataFrame and save to CSV\n", + "df = pd.DataFrame(data)\n", + "df = df[['filename', 'submission_type', 'document_type', 'filing_date', \n", + " 'report_date', 'report_year', 'cik', 'company_conformed_name']]\n", + "\n", + "# now we merge in ticker data\n", + "ticker_df = pd.DataFrame(load_package_dataset('company_tickers')) # dataset is stored as a list of dictionaries so as to allow alternatives to pandas, like polars.\n", + "ticker_df = ticker_df[['cik', 'ticker']]\n", + "df = df.merge(ticker_df, on='cik', how='left')\n", + "\n", + "df.to_csv('data/apple_10k_attachments.csv', index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Cautionary note: Tickers change over time. `company_tickers` dataset is based on the current SEC provided ticker dataset." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}