From 2d4e3558497b09db83652adbf753b1f9397fa629 Mon Sep 17 00:00:00 2001
From: David Lord
Date: Wed, 25 Oct 2017 14:20:28 +1000
Subject: [PATCH 1/2] Initial implementation of aiohttp export

Makes the export script require Python 3.6. (async/await were added in 3.5,
so maybe there?)

I'll need to tidy it up a bit, but this is v1 of the functionality.

Adds -r/--concurrent-requests for throttling. Defaults to 200.
(configured in .env as CONCURRENT_REQUESTS)

Caveats:

- Adds an aiohttp dependency.
- Stops using upload._session, effectively duplicating that functionality
  to get access to aiohttp.ClientSession.
- Adds logging to record the files downloaded. Previously silent.

I've also noticed a bug in filename parsing where a bunch of files all named
`apple.png` are created. This script parses the URL to retrieve the filename,
which exposes this duplication. My version does more parsing of the HTML to
detect the :emoji_name: as used by Slack clients. Currently I'm not
addressing this.
---
 .env.example     |  1 +
 export.py        | 78 +++++++++++++++++++++++++++++++++++-------------
 requirements.txt |  1 +
 3 files changed, 60 insertions(+), 20 deletions(-)

diff --git a/.env.example b/.env.example
index 64edf06e..b0eea4ad 100644
--- a/.env.example
+++ b/.env.example
@@ -2,3 +2,4 @@ export SLACK_TEAM=
 export SLACK_COOKIE=
 export EMOJI_NAME_PREFIX=
 export EMOJI_NAME_SUFFIX=
+export CONCURRENT_REQUESTS=
\ No newline at end of file
diff --git a/export.py b/export.py
index 2bd49e64..84ae994c 100755
--- a/export.py
+++ b/export.py
@@ -3,17 +3,22 @@
 
 # Export emoji in a Slack team as files
 # https://github.com/smashwilson/slack-emojinator
 
-from __future__ import print_function
-
 import requests
 import lxml.html
 import argparse
 import os
 import shutil
+import asyncio, aiohttp
+import logging
 
 from upload import _session
 
+logging.basicConfig(level=logging.INFO, format="%(asctime)-15s\t%(message)s")
+logger = logging.getLogger(__name__)
+
+URL = "https://{team_name}.slack.com/customize/emoji"
+
 
 def _argparse():
     parser = argparse.ArgumentParser(
@@ -33,32 +38,65 @@ def _argparse():
         default=os.getenv('SLACK_COOKIE'),
         help='Defaults to the $SLACK_COOKIE environment variable.'
     )
+    parser.add_argument(
+        '--concurrent-requests', '-r',
+        default=os.getenv('CONCURRENT_REQUESTS', 200),
+        help='Maximum concurrent requests. Defaults to the $CONCURRENT_REQUESTS environment variable or 200.'
+    )
     args = parser.parse_args()
     return args
 
+def concurrent_http_get(num_chunks: int, session: aiohttp.ClientSession):
+    semaphore = asyncio.Semaphore(num_chunks)
+
+    async def http_get(url, name):
+        nonlocal semaphore
+        with (await semaphore):
+            response = await session.get(url)
+            body = await response.content.read()
+            await response.wait_for_close()
+        return body, name, url
+    return http_get
 
-def main():
+def handle_response(response, name: str, url: str, directory: str):
+    logger.info(f"Got {name.ljust(15)} {url}")
+    ext = url.split(".")[-1]
+    with open(os.path.join(directory, f"{name}.{ext}"), 'wb') as out:
+        out.write(response)
+
+def _async_session(auth_cookie):
+    return aiohttp.ClientSession(headers={"Cookie": auth_cookie})
+
+async def main():
     args = _argparse()
     if not os.path.exists(args.directory):
         os.makedirs(args.directory)
 
-    session = _session(args)
-    resp = session.get(session.url)
-    tree = lxml.html.fromstring(resp.text)
-    urls = tree.xpath(r'//td[@headers="custom_emoji_image"]/span/@data-original')
-    names = [u.split('/')[-2] for u in urls]
-
-    for emoji_name, emoji_url in zip(names, urls):
-        if "alias" not in emoji_url: # this does not seem necessary ...
-            file_extension = emoji_url.split(".")[-1]
-            request = session.get(emoji_url, stream=True)
-            if request.status_code == 200:
-                filename = '%s/%s.%s' % (args.directory, emoji_name,
-                                         file_extension)
-                with open(filename, 'wb') as out_file:
-                    shutil.copyfileobj(request.raw, out_file)
-                del request
+    async with _async_session(args.cookie) as session:
+        endpoint = URL.format(team_name=args.team_name)
+        logger.info(f"Getting {endpoint}")
+        resp = await session.get(endpoint)
+        async with resp:
+            if resp.status != 200:
+                logger.error(f"Failed to retrieve emoji list ({resp.status})")
+                return
+            text = await resp.text()
+        tree = lxml.html.fromstring(text)
+        urls = tree.xpath(r'//td[@headers="custom_emoji_image"]/span/@data-original')
+        names = [u.split('/')[-2] for u in urls]
+
+        logger.info(f"Parsed {len(names)} emojis")
+        assert len(names) > 0
+
+        http_get = concurrent_http_get(args.concurrent_requests, session)
+        tasks = [http_get(emoji_url, emoji_name) for emoji_name, emoji_url in zip(names, urls) if "alias" not in emoji_url]
+        for future in asyncio.as_completed(tasks):
+            data, name, url = await future
+            handle_response(data, name, url, args.directory)
+
 
 if __name__ == '__main__':
-    main()
+    loop = asyncio.get_event_loop()
+    loop.run_until_complete(main())
+
diff --git a/requirements.txt b/requirements.txt
index 50143c2c..afce5421 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 beautifulsoup4>=4.4, <5.0
 requests>=2.5.3, <3.0
 lxml==3.7.3
+aiohttp==2.3.1
\ No newline at end of file

From 8bfcb401a3c77e6a494cf362ac746dd17f7e93ea Mon Sep 17 00:00:00 2001
From: David Lord
Date: Wed, 24 Jan 2018 16:54:03 +1100
Subject: [PATCH 2/2] Add aiohttp to Pipfile

---
 Pipfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Pipfile b/Pipfile
index ecb68e7f..c240e21d 100644
--- a/Pipfile
+++ b/Pipfile
@@ -14,3 +14,4 @@ name = "pypi"
 "beautifulsoup4" = "<5.0,>=4.4"
 requests = "<3.0,>=2.5.3"
 lxml = "*"
+aiohttp = ">2.3.0"
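
Note on the apple.png caveat from the first commit message (not part of the
patches above): the emoji short name could be read out of the customize/emoji
table itself instead of the image URL. A minimal sketch of that idea follows.
The only piece taken from the existing code is the custom_emoji_image /
data-original XPath; the custom_emoji_name header and the row-based layout are
assumptions about Slack's markup and would need checking against a real page.

import lxml.html


def parse_emoji_table(page_html):
    """Yield (name, url) pairs for each custom emoji row on the page.

    Takes the emoji name from the table row rather than from the image URL,
    so several emoji that share one image file (e.g. apple.png) no longer
    collapse onto the same filename.
    """
    tree = lxml.html.fromstring(page_html)
    # Walking up to the enclosing <tr> and reading a name cell is assumed,
    # not verified against Slack's current markup.
    for row in tree.xpath('//tr[.//td[@headers="custom_emoji_image"]]'):
        urls = row.xpath('.//td[@headers="custom_emoji_image"]/span/@data-original')
        names = row.xpath('.//td[@headers="custom_emoji_name"]//text()')
        if not urls or not names:
            continue
        # Slack clients display the name as ":apple:"; strip the colons.
        name = names[0].strip().strip(':')
        yield name, urls[0]

main() could then build each filename as f"{name}.{url.split('.')[-1]}" from
these pairs instead of re-deriving the name with u.split('/')[-2].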