-
Notifications
You must be signed in to change notification settings - Fork 150
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add owner, source and size to GoogleDriveLoader's Document metadata. #179
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -47,6 +47,108 @@ class GoogleDriveLoader(BaseLoader, BaseModel): | |
"""The file loader kwargs to use.""" | ||
load_auth: bool = False | ||
"""Whether to load authorization identities.""" | ||
load_extended_metadata: bool = False | ||
"""Whether to load extended metadata.""" | ||
|
||
def _get_file_size_from_id(self, id: str) -> str:
    """Fetch the size of a Drive file.

    Args:
        id: The Drive file ID to look up.

    Returns:
        The ``size`` field returned by the Drive API for the file, or
        ``"unknown"`` when the request fails or the response carries no
        ``size`` field.

    Raises:
        ImportError: If ``google-api-python-client`` is not installed.
    """
    try:
        import googleapiclient.errors  # type: ignore[import]
        from googleapiclient.discovery import build  # type: ignore[import]
    except ImportError as exc:
        raise ImportError(
            "You must run "
            "`pip install --upgrade "
            "google-api-python-client` "
            "to load the file size."
        ) from exc

    import logging

    logger = logging.getLogger(__name__)

    creds = self._load_credentials()
    service = build("drive", "v3", credentials=creds)
    try:
        file = service.files().get(fileId=id, fields="size").execute()
        return file["size"]
    except googleapiclient.errors.HttpError as exc:
        # Not every HTTP failure is a permission problem (it may also be a
        # 404, a quota error, ...), so report the actual error as well.
        logger.warning(
            "Could not retrieve size for the file with fileId: %s "
            "(the user may not have sufficient permissions): %s",
            id,
            exc,
        )
        return "unknown"
    except KeyError:
        # The request succeeded but the response has no "size" field.
        logger.warning(
            "The Drive API returned no size for the file with fileId: %s", id
        )
        return "unknown"
    except Exception as exc:
        # Size is best-effort metadata: never let it abort document loading.
        logger.warning(
            "Error occurred while fetching the size for the file with "
            "fileId: %s. Error: %s",
            id,
            exc,
        )
        return "unknown"
|
||
def _get_owner_metadata_from_id(self, id: str) -> str:
    """Fetch the e-mail address of the first listed owner of a Drive file.

    Args:
        id: The Drive file ID to look up.

    Returns:
        The ``emailAddress`` of the first entry in the file's ``owners``
        list, or ``"unknown"`` when the request fails, the file lists no
        owners, or the owner entry has no e-mail address.

    Raises:
        ImportError: If ``google-api-python-client`` is not installed.
    """
    try:
        import googleapiclient.errors  # type: ignore[import]
        from googleapiclient.discovery import build  # type: ignore[import]
    except ImportError as exc:
        raise ImportError(
            "You must run "
            "`pip install --upgrade "
            "google-api-python-client` "
            "to load the file owner."
        ) from exc

    import logging

    logger = logging.getLogger(__name__)

    creds = self._load_credentials()
    service = build("drive", "v3", credentials=creds)
    try:
        file = service.files().get(fileId=id, fields="owners").execute()
        owners = file.get("owners") or []
        # Guard against a missing/empty owners list and a missing e-mail so
        # the declared ``str`` return type always holds.
        email = owners[0].get("emailAddress") if owners else None
        return email or "unknown"
    except googleapiclient.errors.HttpError as exc:
        # Not every HTTP failure is a permission problem, so report the
        # actual error as well.
        logger.warning(
            "Could not retrieve owner for the file with fileId: %s "
            "(the user may not have sufficient permissions): %s",
            id,
            exc,
        )
        return "unknown"
    except Exception as exc:
        # Owner is best-effort metadata: never let it abort document loading.
        logger.warning(
            "Error occurred while fetching the owner for the file with "
            "fileId: %s with error: %s",
            id,
            exc,
        )
        return "unknown"
|
||
def _get_file_path_from_id(self, id: str) -> str:
    """Fetch the path of a Drive file by walking up its parent chain.

    Starting at ``id``, the name of each file is recorded and the walk
    continues with the first entry of its ``parents`` list until a file
    without parents is reached or a lookup fails.

    Args:
        id: The Drive file ID to start from.

    Returns:
        The collected names joined with ``"/"``, outermost ancestor first.
        If a lookup fails part-way, the path contains only the names
        collected so far (possibly an empty string).

    Raises:
        ImportError: If ``google-api-python-client`` is not installed.
    """
    try:
        import googleapiclient.errors  # type: ignore[import]
        from googleapiclient.discovery import build  # type: ignore[import]
    except ImportError as exc:
        raise ImportError(
            "You must run "
            "`pip install --upgrade "
            "google-api-python-client` "
            "to load the file path."
        ) from exc

    import logging

    logger = logging.getLogger(__name__)

    creds = self._load_credentials()
    service = build("drive", "v3", credentials=creds)
    names: List[str] = []
    current_id = id
    while True:
        # Keep the try body limited to the API call itself.
        try:
            file = (
                service.files()
                .get(fileId=current_id, fields="name, parents")
                .execute()
            )
        except googleapiclient.errors.HttpError as exc:
            # Report which ancestor failed, not only the starting file.
            logger.warning(
                "Could not retrieve path for the file with fileId: %s; "
                "lookup of fileId: %s failed (the user may not have "
                "sufficient permissions): %s",
                id,
                current_id,
                exc,
            )
            break
        names.append(file["name"])
        parents = file.get("parents")
        if not parents:
            # No (or empty) parents list: the top of the chain was reached.
            break
        current_id = parents[0]
    # Names were collected leaf-first; the path reads root-first.
    return "/".join(reversed(names))
|
||
def _get_identity_metadata_from_id(self, id: str) -> List[str]: | ||
"""Fetch the list of people having access to ID file.""" | ||
|
@@ -203,6 +305,10 @@ def _load_sheet_from_id(self, id: str) -> List[Document]: | |
sheets = spreadsheet.get("sheets", []) | ||
if self.load_auth: | ||
authorized_identities = self._get_identity_metadata_from_id(id) | ||
if self.load_extended_metadata: | ||
owner = self._get_owner_metadata_from_id(id) | ||
size = self._get_file_size_from_id(id) | ||
full_path = self._get_file_path_from_id(id) | ||
|
||
documents = [] | ||
for sheet in sheets: | ||
|
@@ -229,6 +335,10 @@ def _load_sheet_from_id(self, id: str) -> List[Document]: | |
} | ||
if self.load_auth: | ||
metadata["authorized_identities"] = authorized_identities | ||
if self.load_extended_metadata: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we merge this and above section maybe? |
||
metadata["owner"] = owner | ||
metadata["size"] = size | ||
metadata["full_path"] = full_path | ||
content = [] | ||
for j, v in enumerate(row): | ||
title = header[j].strip() if len(header) > j else "" | ||
|
@@ -251,6 +361,10 @@ def _load_document_from_id(self, id: str) -> Document: | |
service = build("drive", "v3", credentials=creds) | ||
if self.load_auth: | ||
authorized_identities = self._get_identity_metadata_from_id(id) | ||
if self.load_extended_metadata: | ||
owner = self._get_owner_metadata_from_id(id) | ||
size = self._get_file_size_from_id(id) | ||
full_path = self._get_file_path_from_id(id) | ||
|
||
file = ( | ||
service.files() | ||
|
@@ -279,6 +393,10 @@ def _load_document_from_id(self, id: str) -> Document: | |
} | ||
if self.load_auth: | ||
metadata["authorized_identities"] = authorized_identities # type: ignore | ||
if self.load_extended_metadata: | ||
metadata["owner"] = owner | ||
metadata["size"] = size | ||
metadata["full_path"] = full_path | ||
return Document(page_content=text, metadata=metadata) | ||
|
||
def _load_documents_from_folder( | ||
|
@@ -358,6 +476,10 @@ def _load_file_from_id(self, id: str) -> List[Document]: | |
|
||
if self.load_auth: | ||
authorized_identities = self._get_identity_metadata_from_id(id) | ||
if self.load_extended_metadata: | ||
owner = self._get_owner_metadata_from_id(id) | ||
size = self._get_file_size_from_id(id) | ||
full_path = self._get_file_path_from_id(id) | ||
|
||
file = service.files().get(fileId=id, supportsAllDrives=True).execute() | ||
request = service.files().get_media(fileId=id) | ||
|
@@ -377,6 +499,10 @@ def _load_file_from_id(self, id: str) -> List[Document]: | |
doc.metadata["title"] = f"{file.get('name')}" | ||
if self.load_auth: | ||
doc.metadata["authorized_identities"] = authorized_identities | ||
if self.load_extended_metadata: | ||
doc.metadata["owner"] = owner | ||
doc.metadata["size"] = size | ||
doc.metadata["full_path"] = full_path | ||
return docs | ||
|
||
else: | ||
|
@@ -394,6 +520,10 @@ def _load_file_from_id(self, id: str) -> List[Document]: | |
} | ||
if self.load_auth: | ||
metadata["authorized_identities"] = authorized_identities | ||
if self.load_extended_metadata: | ||
metadata["owner"] = owner | ||
metadata["size"] = size | ||
metadata["full_path"] = full_path | ||
docs.append( | ||
Document( | ||
page_content=page.extract_text(), | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please note that we have dependency groups (extras) now, so the install hint should look like:
`pip install langchain-google-community[drive]`