Skip to content

Commit

Permalink
Add owner, source and size to GoogleDriveLoader's Document metadata
Browse files Browse the repository at this point in the history
Signed-off-by: Rahul Tripathi <[email protected]>
  • Loading branch information
Rahul Tripathi authored and rahul-trip committed Apr 25, 2024
1 parent 56d91df commit 7cc873b
Showing 1 changed file with 130 additions and 0 deletions.
130 changes: 130 additions & 0 deletions libs/community/langchain_google_community/drive.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,108 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
"""The file loader kwargs to use."""
load_auth: bool = False
"""Whether to load authorization identities."""
load_extended_metadata: bool = False
"""Whether to load extended metadata."""

def _get_file_size_from_id(self, id: str) -> str:
    """Fetch the size of the file.

    This is a best-effort lookup: any failure is reported on stdout and
    ``"unknown"`` is returned instead of raising.

    Args:
        id: The Google Drive file ID to look up.

    Returns:
        The ``size`` field as returned by the Drive API, or ``"unknown"``
        when the API response carries no ``size`` field or the request fails.

    Raises:
        ImportError: If ``google-api-python-client`` is not installed.
    """
    try:
        import googleapiclient.errors  # type: ignore[import]
        from googleapiclient.discovery import build  # type: ignore[import]
    except ImportError as exc:
        raise ImportError(
            "You must run "
            "`pip install --upgrade "
            "google-api-python-client` "
            "to load extended metadata."
        ) from exc

    creds = self._load_credentials()
    service = build("drive", "v3", credentials=creds)
    try:
        # Declare shared-drive support, like the files().get call used when
        # loading file content, so shared-drive files can be resolved too.
        file = (
            service.files()
            .get(fileId=id, fields="size", supportsAllDrives=True)
            .execute()
        )
        # A response without "size" is not an error; report it as unknown
        # without going through the failure path below.
        return file.get("size", "unknown")
    except googleapiclient.errors.HttpError:
        # Implicit string concatenation keeps source indentation out of the
        # message (a backslash continuation inside the literal would not).
        print(
            "insufficientFilePermissions: The user does not have sufficient "
            f"permissions to retrieve size for the file with fileId: {id}"
        )
        return "unknown"
    except Exception as exc:
        print(
            f"Error occurred while fetching the size for the file with fileId: {id}"
        )
        print(f"Error: {exc}")
        return "unknown"

def _get_owner_metadata_from_id(self, id: str) -> str:
    """Fetch the owner of the file.

    This is a best-effort lookup: any failure is reported on stdout and
    ``"unknown"`` is returned instead of raising.

    Args:
        id: The Google Drive file ID to look up.

    Returns:
        The ``emailAddress`` of the first entry in the file's ``owners``
        list, or ``"unknown"`` when the response has no owners, the first
        owner has no email address, or the request fails.

    Raises:
        ImportError: If ``google-api-python-client`` is not installed.
    """
    try:
        import googleapiclient.errors  # type: ignore[import]
        from googleapiclient.discovery import build  # type: ignore[import]
    except ImportError as exc:
        raise ImportError(
            "You must run "
            "`pip install --upgrade "
            "google-api-python-client` "
            "to load extended metadata."
        ) from exc

    creds = self._load_credentials()
    service = build("drive", "v3", credentials=creds)
    try:
        # Declare shared-drive support, like the files().get call used when
        # loading file content, so shared-drive files can be resolved too.
        file = (
            service.files()
            .get(fileId=id, fields="owners", supportsAllDrives=True)
            .execute()
        )
        owners = file.get("owners") or []
        if not owners:
            # No owner information in the response; not an error condition.
            return "unknown"
        # Fall back to "unknown" so the declared ``str`` return type holds
        # even when the owner entry carries no email address.
        return owners[0].get("emailAddress") or "unknown"
    except googleapiclient.errors.HttpError:
        # Implicit string concatenation keeps source indentation out of the
        # message (a backslash continuation inside the literal would not).
        print(
            "insufficientFilePermissions: The user does not have sufficient "
            f"permissions to retrieve owner for the file with fileId: {id}"
        )
        return "unknown"
    except Exception as exc:
        print(
            "Error occurred while fetching the owner for the file with fileId: "
            f"{id} with error: {exc}"
        )
        return "unknown"

def _get_file_path_from_id(self, id: str) -> str:
    """Fetch the full path of the file starting from the root.

    Walks upwards from the file through the first entry of each ``parents``
    list, collecting names, until an item without parents is reached or a
    request fails with an ``HttpError``.

    Args:
        id: The Google Drive file ID whose path should be resolved.

    Returns:
        The collected names joined with ``"/"``, topmost ancestor first.
        If a lookup fails part-way, only the names gathered so far are
        included (possibly an empty string).

    Raises:
        ImportError: If ``google-api-python-client`` is not installed.
    """
    try:
        import googleapiclient.errors  # type: ignore[import]
        from googleapiclient.discovery import build  # type: ignore[import]
    except ImportError as exc:
        raise ImportError(
            "You must run "
            "`pip install --upgrade "
            "google-api-python-client` "
            "to load extended metadata."
        ) from exc

    creds = self._load_credentials()
    service = build("drive", "v3", credentials=creds)
    path = []
    current_id = id
    while True:
        # Only the API call is guarded; the bookkeeping below cannot raise
        # HttpError.
        try:
            # Declare shared-drive support, like the files().get call used
            # when loading file content.
            file = (
                service.files()
                .get(
                    fileId=current_id,
                    fields="name, parents",
                    supportsAllDrives=True,
                )
                .execute()
            )
        except googleapiclient.errors.HttpError:
            # Implicit concatenation keeps the space between "sufficient"
            # and "permissions" and keeps source indentation out of the text.
            print(
                "insufficientFilePermissions: The user does not have sufficient "
                f"permissions to retrieve path for the file with fileId: {id}"
            )
            break
        path.append(file["name"])
        parents = file.get("parents")
        if not parents:
            # Missing or empty parents list: this is the top of the chain.
            break
        current_id = parents[0]
    # Names were gathered leaf-first; flip them so the path reads root-first.
    path.reverse()
    return "/".join(path)

def _get_identity_metadata_from_id(self, id: str) -> List[str]:
"""Fetch the list of people having access to ID file."""
Expand Down Expand Up @@ -203,6 +305,10 @@ def _load_sheet_from_id(self, id: str) -> List[Document]:
sheets = spreadsheet.get("sheets", [])
if self.load_auth:
authorized_identities = self._get_identity_metadata_from_id(id)
if self.load_extended_metadata:
owner = self._get_owner_metadata_from_id(id)
size = self._get_file_size_from_id(id)
full_path = self._get_file_path_from_id(id)

documents = []
for sheet in sheets:
Expand All @@ -229,6 +335,10 @@ def _load_sheet_from_id(self, id: str) -> List[Document]:
}
if self.load_auth:
metadata["authorized_identities"] = authorized_identities
if self.load_extended_metadata:
metadata["owner"] = owner
metadata["size"] = size
metadata["full_path"] = full_path
content = []
for j, v in enumerate(row):
title = header[j].strip() if len(header) > j else ""
Expand All @@ -251,6 +361,10 @@ def _load_document_from_id(self, id: str) -> Document:
service = build("drive", "v3", credentials=creds)
if self.load_auth:
authorized_identities = self._get_identity_metadata_from_id(id)
if self.load_extended_metadata:
owner = self._get_owner_metadata_from_id(id)
size = self._get_file_size_from_id(id)
full_path = self._get_file_path_from_id(id)

file = (
service.files()
Expand Down Expand Up @@ -279,6 +393,10 @@ def _load_document_from_id(self, id: str) -> Document:
}
if self.load_auth:
metadata["authorized_identities"] = authorized_identities # type: ignore
if self.load_extended_metadata:
metadata["owner"] = owner
metadata["size"] = size
metadata["full_path"] = full_path
return Document(page_content=text, metadata=metadata)

def _load_documents_from_folder(
Expand Down Expand Up @@ -358,6 +476,10 @@ def _load_file_from_id(self, id: str) -> List[Document]:

if self.load_auth:
authorized_identities = self._get_identity_metadata_from_id(id)
if self.load_extended_metadata:
owner = self._get_owner_metadata_from_id(id)
size = self._get_file_size_from_id(id)
full_path = self._get_file_path_from_id(id)

file = service.files().get(fileId=id, supportsAllDrives=True).execute()
request = service.files().get_media(fileId=id)
Expand All @@ -377,6 +499,10 @@ def _load_file_from_id(self, id: str) -> List[Document]:
doc.metadata["title"] = f"{file.get('name')}"
if self.load_auth:
doc.metadata["authorized_identities"] = authorized_identities
if self.load_extended_metadata:
doc.metadata["owner"] = owner
doc.metadata["size"] = size
doc.metadata["full_path"] = full_path
return docs

else:
Expand All @@ -394,6 +520,10 @@ def _load_file_from_id(self, id: str) -> List[Document]:
}
if self.load_auth:
metadata["authorized_identities"] = authorized_identities
if self.load_extended_metadata:
metadata["owner"] = owner
metadata["size"] = size
metadata["full_path"] = full_path
docs.append(
Document(
page_content=page.extract_text(),
Expand Down

0 comments on commit 7cc873b

Please sign in to comment.