Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Save full Firefox profile #917

Merged
merged 2 commits into from
May 7, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 2 additions & 12 deletions docs/Configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ of configurations of `class<BrowserParams>`.
- `data_directory`
- The directory into which screenshots and page dumps will be saved
- [Intended to be removed by #232](https://github.com/mozilla/OpenWPM/issues/232)
- `log_directory` -> supported file extensions are `.log`
- `log_path` -> supported file extensions are `.log`
- The path to the file in which OpenWPM will log. The
directory given will be created if it does not exist.
- `failure_limit` -> has to be either of type `int` or `None`
Expand Down Expand Up @@ -287,17 +287,7 @@ browser before visiting the next `site` in `sites`.

### Loading and saving a browser profile

It's possible to load and save profiles during stateful crawls. Profile dumps
currently consist of the following browser storage items:

- cookies
- localStorage
- IndexedDB
- browser history

Other browser state, such as the browser cache, is not saved. In
[Issue #62](https://github.com/citp/OpenWPM/issues/62) we plan to expand
profiles to include all browser storage.
It's possible to load and save profiles during stateful crawls.

#### Save a profile

Expand Down
1 change: 1 addition & 0 deletions openwpm/browser_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,6 +674,7 @@ def _start_extension(self, browser_profile_path: Path) -> ClientSocket:
with open(ep_filename, "rt") as f:
port = int(f.read().strip())

ep_filename.unlink()
self.logger.debug(
"BROWSER %i: Connecting to extension on port %i"
% (self.browser_params.browser_id, port)
Expand Down
46 changes: 10 additions & 36 deletions openwpm/commands/profile_commands.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import logging
import shutil
import tarfile
from pathlib import Path

Expand Down Expand Up @@ -47,47 +46,22 @@ def dump_profile(
% (browser_params.browser_id, browser_profile_path, tar_path)
)

storage_vector_files = [
tar.add(browser_profile_path, arcname="")
archived_items = tar.getnames()
tar.close()

required_items = [
"cookies.sqlite", # cookies
"cookies.sqlite-shm",
"cookies.sqlite-wal",
"places.sqlite", # history
"places.sqlite-shm",
"places.sqlite-wal",
"webappsstore.sqlite", # localStorage
"webappsstore.sqlite-shm",
"webappsstore.sqlite-wal",
]
storage_vector_dirs = [
"webapps", # related to localStorage?
"storage", # directory for IndexedDB
]
for item in storage_vector_files:
full_path = browser_profile_path / item
if (
not full_path.is_file()
and not full_path.name.endswith("shm")
and not full_path.name.endswith("wal")
):
for item in required_items:
if item not in archived_items:
logger.critical(
"BROWSER %i: %s NOT FOUND IN profile folder, skipping."
% (browser_params.browser_id, full_path)
"BROWSER %i: %s NOT FOUND IN profile folder"
% (browser_params.browser_id, item)
)
elif not full_path.is_file() and (
full_path.name.endswith("shm") or full_path.name.endswith("wal")
):
continue # These are just checkpoint files
tar.add(full_path, arcname=item)
for item in storage_vector_dirs:
full_path = browser_profile_path / item
if not full_path.is_dir():
logger.warning(
"BROWSER %i: %s NOT FOUND IN profile folder, skipping."
% (browser_params.browser_id, full_path)
)
continue
tar.add(full_path, arcname=item)
tar.close()
raise RuntimeError("Profile dump not successful")


class DumpProfileCommand(BaseCommand):
Expand Down
32 changes: 31 additions & 1 deletion test/test_profile.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import tarfile
from pathlib import Path
from typing import Any

Expand All @@ -22,7 +23,36 @@ def test_saving(default_params, task_manager_creator):
manager, _ = task_manager_creator((manager_params, browser_params[:1]))
manager.get(BASE_TEST_URL)
manager.close()
assert (browser_params[0].profile_archive_dir / "profile.tar.gz").is_file()
tar_path = browser_params[0].profile_archive_dir / "profile.tar.gz"
assert tar_path.is_file()
# Test that the archived profile contains some basic items
profile_items = [
"cookies.sqlite",
"places.sqlite",
"webappsstore.sqlite",
"prefs.js",
"bookmarkbackups",
"cache2",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see this file in my profile, but since the tests are passing, I guess it must be some quirk in my setup.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Although I couldn't find this documented anywhere, after experimenting a little and reading the comments in https://stackoverflow.com/questions/25623705/selenium-retaining-firefox-cache-and-history-files, I concluded the following. When Firefox is run with Selenium, the browser cache is stored inside the profile, under the `cache2` directory. When Firefox is run normally, the cache is stored under `~/.cache/mozilla/firefox/xxxxx/cache2`, where `xxxxx` is the same as the profile directory name.

"storage",
]
with tarfile.open(tar_path, "r:gz") as tar:
archive_items = tar.getnames()
for item in profile_items:
assert item in archive_items


def test_save_incomplete_profile_error(default_params, task_manager_creator):
    """Check that closing the manager fails when ``cookies.sqlite`` is missing.

    A single browser is configured with a profile archive directory, one
    page is visited, and ``cookies.sqlite`` is then deleted from the live
    profile folder. ``manager.close()`` is expected to raise a
    ``RuntimeError`` whose message is "Profile dump not successful".
    """
    manager_params, browser_params = default_params
    manager_params.num_browsers = 1

    archive_dir = manager_params.data_directory / "browser_profile"
    browser_params[0].profile_archive_dir = archive_dir

    manager, _ = task_manager_creator((manager_params, browser_params[:1]))
    manager.get(BASE_TEST_URL)

    # Make the profile incomplete before the manager shuts down.
    cookies_db = manager.browsers[0].current_profile_path / "cookies.sqlite"
    cookies_db.unlink()

    with pytest.raises(RuntimeError) as error:
        manager.close()
    # Checked after the context manager exits so the assertion actually runs.
    assert str(error.value) == "Profile dump not successful"


def test_crash_profile(default_params, task_manager_creator):
Expand Down