Skip to content

Commit

Permalink
Initial implementation of aiohttp export
Browse files Browse the repository at this point in the history
Makes the export script require Python 3.6.
(async/await were added in 3.5, but the f-strings this script uses require 3.6.)

I'll need to tidy it up a bit, but this is v1 of functionality.

Adds -r/--concurrent-requests for throttling. Defaults to 200.
(configured in .env as CONCURRENT_REQUESTS)

Caveats:
- Adds an aiohttp dependency.
- Stops using upload._session, effectively duplicating the
functionality to get access to aiohttp.ClientSession.
- Adds logging to record the files downloaded. Previously silent.

I've also noticed a bug in filename parsing where a bunch of files all
named `apple.png` are created. This script parses the URL to retrieve
the filename, which exposes this duplication. My version does more
parsing in the HTML to detect the :emoji_name: as used by Slack clients.
Currently I'm not addressing this.
  • Loading branch information
d-lord committed Oct 25, 2017
1 parent a9f3ea6 commit 2d4e355
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 20 deletions.
1 change: 1 addition & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ export SLACK_TEAM=
export SLACK_COOKIE=
export EMOJI_NAME_PREFIX=
export EMOJI_NAME_SUFFIX=
export CONCURRENT_REQUESTS=
78 changes: 58 additions & 20 deletions export.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,22 @@
# Export emoji in a Slack team as files
# https://github.com/smashwilson/slack-emojinator

from __future__ import print_function

import requests
import lxml.html

import argparse
import os
import shutil
import asyncio, aiohttp
import logging

from upload import _session

logging.basicConfig(level=logging.INFO, format="%(asctime)-15s\t%(message)s")
logger = logging.getLogger(__name__)

URL = "https://{team_name}.slack.com/customize/emoji"


def _argparse():
parser = argparse.ArgumentParser(
Expand All @@ -33,32 +38,65 @@ def _argparse():
default=os.getenv('SLACK_COOKIE'),
help='Defaults to the $SLACK_COOKIE environment variable.'
)
parser.add_argument(
'--concurrent-requests', '-r',
default=os.getenv('CONCURRENT_REQUESTS', 200),
help='Maximum concurrent requests. Defaults to the $CONCURRENT_REQUESTS environment variable or 200.'
)
args = parser.parse_args()
return args

def concurrent_http_get(num_chunks: int, session: aiohttp.ClientSession):
semaphore = asyncio.Semaphore(num_chunks)

async def http_get(url, name):
nonlocal semaphore
with (await semaphore):
response = await session.get(url)
body = await response.content.read()
await response.wait_for_close()
return body, name, url
return http_get

def main():
def handle_response(response, name: str, url: str, directory: str):
logger.info(f"Got {name.ljust(15)} {url}")
ext = url.split(".")[-1]
with open(os.path.join(directory, f"{name}.{ext}"), 'wb') as out:
out.write(response)

def _async_session(auth_cookie):
return aiohttp.ClientSession(headers={"Cookie": auth_cookie})

async def main():
args = _argparse()

if not os.path.exists(args.directory):
os.makedirs(args.directory)

session = _session(args)
resp = session.get(session.url)
tree = lxml.html.fromstring(resp.text)
urls = tree.xpath(r'//td[@headers="custom_emoji_image"]/span/@data-original')
names = [u.split('/')[-2] for u in urls]

for emoji_name, emoji_url in zip(names, urls):
if "alias" not in emoji_url: # this does not seem necessary ...
file_extension = emoji_url.split(".")[-1]
request = session.get(emoji_url, stream=True)
if request.status_code == 200:
filename = '%s/%s.%s' % (args.directory, emoji_name,
file_extension)
with open(filename, 'wb') as out_file:
shutil.copyfileobj(request.raw, out_file)
del request
async with _async_session(args.cookie) as session:
endpoint = URL.format(team_name=args.team_name)
logger.info(f"Getting {endpoint}")
resp = await session.get(endpoint)
async with resp:
if resp.status != 200:
logger.error(f"Failed to retrieve emoji list ({resp.status})")
return
text = await resp.text()
tree = lxml.html.fromstring(text)
urls = tree.xpath(r'//td[@headers="custom_emoji_image"]/span/@data-original')
names = [u.split('/')[-2] for u in urls]

logger.info(f"Parsed {len(names)} emojis")
assert len(names) > 0

http_get = concurrent_http_get(args.concurrent_requests, session)
tasks = [http_get(emoji_url, emoji_name) for emoji_name, emoji_url in zip(names, urls) if "alias" not in emoji_url]
for future in asyncio.as_completed(tasks):
data, name, url = await future
handle_response(data, name, url, args.directory)


if __name__ == '__main__':
main()
loop = asyncio.get_event_loop()
loop.run_until_complete(main())

1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
beautifulsoup4>=4.4, <5.0
requests>=2.5.3, <3.0
lxml==3.7.3
aiohttp==2.3.1

0 comments on commit 2d4e355

Please sign in to comment.