Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add owner, source and size to GoogleDriveLoader's Document metadata. #179

Merged
merged 1 commit into from
Apr 26, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 130 additions & 0 deletions libs/community/langchain_google_community/drive.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,108 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
"""The file loader kwargs to use."""
load_auth: bool = False
"""Whether to load authorization identities."""
load_extended_metadata: bool = False
"""Whether to load extended metadata."""

def _get_file_size_from_id(self, id: str) -> str:
"""Fetch the size of the file."""
try:
import googleapiclient.errors # type: ignore[import]
from googleapiclient.discovery import build # type: ignore[import]
except ImportError as exc:
raise ImportError(
"You must run "
"`pip install --upgrade "
Copy link
Collaborator

@lkuligin lkuligin Apr 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please note that we have groups now, so the hint should look like `pip install langchain-google-community[drive]`.

"google-api-python-client` "
"to load authorization identities."
) from exc

creds = self._load_credentials()
service = build("drive", "v3", credentials=creds)
try:
file = service.files().get(fileId=id, fields="size").execute()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: all of this could be a small local function (something like `_load_attribute`, which could take the fields and credentials as parameters).

return file["size"]
except googleapiclient.errors.HttpError:
print(
f"insufficientFilePermissions: The user does not have sufficient \
permissions to retrieve size for the file with fileId: {id}"
)
return "unknown"
except Exception as exc:
print(
f"Error occurred while fetching the size for the file with fileId: {id}"
)
print(f"Error: {exc}")
return "unknown"

def _get_owner_metadata_from_id(self, id: str) -> str:
"""Fetch the owner of the file."""
try:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: we should have a local `_import_lib` function to avoid copy-pasting the import block in several places.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am planning to import all the components at the top of the file (within a try-except block).

This will improve readability and make the scopes clearer. On the other hand, a common internal method like `_import_lib` would lead to importing the modules multiple times. Also, the imports would be limited to the scope of the `_import_lib` method (which we would have to handle by returning them to the calling methods or by making them global).

Hope this is fine?

import googleapiclient.errors # type: ignore[import]
from googleapiclient.discovery import build # type: ignore[import]
except ImportError as exc:
raise ImportError(
"You must run "
"`pip install --upgrade "
"google-api-python-client` "
"to load authorization identities."
) from exc

creds = self._load_credentials()
service = build("drive", "v3", credentials=creds)
try:
file = service.files().get(fileId=id, fields="owners").execute()
return file["owners"][0].get("emailAddress")
except googleapiclient.errors.HttpError:
print(
f"insufficientFilePermissions: The user does not have sufficient \
permissions to retrieve owner for the file with fileId: {id}"
)
return "unknown"
except Exception as exc:
print(
f"Error occurred while fetching the owner for the file with fileId: \
{id} with error: {exc}"
)
return "unknown"

def _get_file_path_from_id(self, id: str) -> str:
    """Fetch the full path of the file starting from the root.

    Starting at the given file, the method repeatedly asks the Drive API for
    the current item's ``name`` and ``parents``, follows the first listed
    parent, and finally joins the collected names with ``"/"``.

    Args:
        id: The Drive file ID whose path should be resolved.

    Returns:
        The ``"/"``-joined names from the topmost ancestor that could be
        read down to the file itself. If a lookup fails with an
        ``HttpError`` the walk stops there, so the result may be a partial
        path or an empty string.

    Raises:
        ImportError: If ``google-api-python-client`` is not installed.
    """
    try:
        import googleapiclient.errors  # type: ignore[import]
        from googleapiclient.discovery import build  # type: ignore[import]
    except ImportError as exc:
        raise ImportError(
            "You must run "
            "`pip install --upgrade "
            "google-api-python-client` "
            "to fetch the file path."
        ) from exc

    creds = self._load_credentials()
    service = build("drive", "v3", credentials=creds)
    path = []
    current_id = id
    while True:
        # Keep the try body to the single call that can raise HttpError.
        try:
            file = (
                service.files()
                .get(fileId=current_id, fields="name, parents")
                .execute()
            )
        except googleapiclient.errors.HttpError:
            # Best effort: report the failure and return what was collected.
            # Adjacent literals are used instead of a backslash continuation
            # so no stray indentation (or missing space) ends up in the text.
            print(
                "insufficientFilePermissions: The user does not have "
                "sufficient permissions to retrieve path for the file "
                f"with fileId: {id}"
            )
            break
        path.append(file["name"])
        # A missing *or empty* "parents" list both mean there is nothing
        # further up to follow; indexing an empty list would raise.
        parents = file.get("parents")
        if not parents:
            break
        current_id = parents[0]
    # Names were collected leaf-first; flip them to root-first order.
    path.reverse()
    return "/".join(path)

def _get_identity_metadata_from_id(self, id: str) -> List[str]:
"""Fetch the list of people having access to ID file."""
Expand Down Expand Up @@ -203,6 +305,10 @@ def _load_sheet_from_id(self, id: str) -> List[Document]:
sheets = spreadsheet.get("sheets", [])
if self.load_auth:
authorized_identities = self._get_identity_metadata_from_id(id)
if self.load_extended_metadata:
owner = self._get_owner_metadata_from_id(id)
size = self._get_file_size_from_id(id)
full_path = self._get_file_path_from_id(id)

documents = []
for sheet in sheets:
Expand All @@ -229,6 +335,10 @@ def _load_sheet_from_id(self, id: str) -> List[Document]:
}
if self.load_auth:
metadata["authorized_identities"] = authorized_identities
if self.load_extended_metadata:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we merge this with the section above, maybe?
metadata["owner"] = self._get_owner_metadata_from_id(id)

metadata["owner"] = owner
metadata["size"] = size
metadata["full_path"] = full_path
content = []
for j, v in enumerate(row):
title = header[j].strip() if len(header) > j else ""
Expand All @@ -251,6 +361,10 @@ def _load_document_from_id(self, id: str) -> Document:
service = build("drive", "v3", credentials=creds)
if self.load_auth:
authorized_identities = self._get_identity_metadata_from_id(id)
if self.load_extended_metadata:
owner = self._get_owner_metadata_from_id(id)
size = self._get_file_size_from_id(id)
full_path = self._get_file_path_from_id(id)

file = (
service.files()
Expand Down Expand Up @@ -279,6 +393,10 @@ def _load_document_from_id(self, id: str) -> Document:
}
if self.load_auth:
metadata["authorized_identities"] = authorized_identities # type: ignore
if self.load_extended_metadata:
metadata["owner"] = owner
metadata["size"] = size
metadata["full_path"] = full_path
return Document(page_content=text, metadata=metadata)

def _load_documents_from_folder(
Expand Down Expand Up @@ -358,6 +476,10 @@ def _load_file_from_id(self, id: str) -> List[Document]:

if self.load_auth:
authorized_identities = self._get_identity_metadata_from_id(id)
if self.load_extended_metadata:
owner = self._get_owner_metadata_from_id(id)
size = self._get_file_size_from_id(id)
full_path = self._get_file_path_from_id(id)

file = service.files().get(fileId=id, supportsAllDrives=True).execute()
request = service.files().get_media(fileId=id)
Expand All @@ -377,6 +499,10 @@ def _load_file_from_id(self, id: str) -> List[Document]:
doc.metadata["title"] = f"{file.get('name')}"
if self.load_auth:
doc.metadata["authorized_identities"] = authorized_identities
if self.load_extended_metadata:
doc.metadata["owner"] = owner
doc.metadata["size"] = size
doc.metadata["full_path"] = full_path
return docs

else:
Expand All @@ -394,6 +520,10 @@ def _load_file_from_id(self, id: str) -> List[Document]:
}
if self.load_auth:
metadata["authorized_identities"] = authorized_identities
if self.load_extended_metadata:
metadata["owner"] = owner
metadata["size"] = size
metadata["full_path"] = full_path
docs.append(
Document(
page_content=page.extract_text(),
Expand Down
Loading