Follow pagination for >500 emoji #23

Merged · 2 commits · Jun 21, 2018
export.py: 79 changes (53 additions, 26 deletions)
@@ -3,21 +3,19 @@
 # Export emoji in a Slack team as files
 # https://github.com/smashwilson/slack-emojinator
 
-import requests
-import lxml.html
-
+import aiohttp
 import argparse
-import os
-import shutil
-import asyncio, aiohttp
+import asyncio
 import logging
-
-from upload import _session
+import lxml.html
+import os
+from typing import List
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)-15s\t%(message)s")
 logger = logging.getLogger(__name__)
 
-URL = "https://{team_name}.slack.com/customize/emoji"
+BASE_URL = 'https://{team_name}.slack.com'
+EMOJI_ENDPOINT = '/customize/emoji'
 
 
 def _argparse():
@@ -41,13 +39,15 @@ def _argparse():
     parser.add_argument(
         '--concurrent-requests', '-r',
         default=os.getenv('CONCURRENT_REQUESTS', 200),
+        type=int,
         help='Maximum concurrent requests. Defaults to the $CONCURRENT_REQUESTS environment variable or 200.'
     )
     args = parser.parse_args()
     return args
 
-def concurrent_http_get(num_chunks: int, session: aiohttp.ClientSession):
-    semaphore = asyncio.Semaphore(num_chunks)
+
+def concurrent_http_get(max_concurrent: int, session: aiohttp.ClientSession):
+    semaphore = asyncio.Semaphore(max_concurrent)
 
     async def http_get(url, name):
         nonlocal semaphore
@@ -56,47 +56,74 @@ async def http_get(url, name):
                 body = await response.content.read()
                 await response.wait_for_close()
         return body, name, url
 
     return http_get
 
-def handle_response(response, name: str, url: str, directory: str):
+
+def save_to_file(response, name: str, url: str, directory: str):
     logger.info(f"Got {name.ljust(15)} {url}")
     ext = url.split(".")[-1]
     with open(os.path.join(directory, f"{name}.{ext}"), 'wb') as out:
         out.write(response)
 
 
+def parse_emoji_from_page(text: str) -> List[str]:
+    '''Given the text of an HTML page, retrieve a list of (relative) URLs to emoji.
+    :param text Raw HTML.
+    :return ['/path/to/first.png', '/path/to/second.png', ...]'''
+    tree = lxml.html.fromstring(text)
+    urls = tree.xpath(r'//td[@headers="custom_emoji_image"]/span/@data-original')
+    return urls
+
+
 def _async_session(auth_cookie):
     return aiohttp.ClientSession(headers={"Cookie": auth_cookie})
 
+
 async def main():
     args = _argparse()
 
     if not os.path.exists(args.directory):
         os.makedirs(args.directory)
 
+    base_url = BASE_URL.format(team_name=args.team_name)
+    emoji_url = base_url + EMOJI_ENDPOINT
+
     async with _async_session(args.cookie) as session:
-        endpoint = URL.format(team_name=args.team_name)
-        logger.info(f"Getting {endpoint}")
-        resp = await session.get(endpoint)
-        async with resp:
-            if resp.status != 200:
-                logger.error(f"Failed to retrieve emoji list ({resp.status})")
+        logger.info(f"Getting {emoji_url}")
+
+        async with session.get(emoji_url) as base_page_q:
+            if base_page_q.status != 200:
+                logger.error(f"Failed to retrieve emoji list ({base_page_q.status})")
                 return
-            text = await resp.text()
+            text = await base_page_q.text()
         tree = lxml.html.fromstring(text)
-        urls = tree.xpath(r'//td[@headers="custom_emoji_image"]/span/@data-original')
-        names = [u.split('/')[-2] for u in urls]
 
-        logger.info(f"Parsed {len(names)} emojis")
-        assert len(names) > 0
+        emoji_urls = []
+        emoji_urls.extend(parse_emoji_from_page(text))
+        other_emoji_pages = [f"{base_url}{p}" for p in
+                             tree.xpath(r'//div[@class="pagination pagination-centered"]'
+                                        r'/ul/li/a[.!="Next"]/@href[.!="#"]')
+                             if p != EMOJI_ENDPOINT]
+        logger.info(f"Getting other emoji from: {other_emoji_pages}")
+        for emoji_page in other_emoji_pages:
+            async with session.get(f"{emoji_page}") as page:
+                text = await page.text()
+                emoji_urls.extend(parse_emoji_from_page(text))
+
+        emoji_names = [u.split('/')[-2] for u in emoji_urls]
+
+        logger.info(f"Parsed {len(emoji_names)} emojis")
+        assert len(emoji_names) > 0
 
         http_get = concurrent_http_get(args.concurrent_requests, session)
-        tasks = [http_get(emoji_url, emoji_name) for emoji_name, emoji_url in zip(names, urls) if "alias" not in emoji_url]
+        tasks = [http_get(emoji_url, emoji_name) for emoji_name, emoji_url in zip(emoji_names, emoji_urls)
+                 if "alias" not in emoji_url]
        for future in asyncio.as_completed(tasks):
             data, name, url = await future
-            handle_response(data, name, url, args.directory)
+            save_to_file(data, name, url, args.directory)
 
 
 if __name__ == '__main__':
     loop = asyncio.get_event_loop()
     loop.run_until_complete(main())
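
---

The heart of this change is the pagination walk: the first customize/emoji page is parsed for emoji image URLs, then every extra page linked from Slack's pagination widget is fetched and parsed the same way. Here is a minimal standalone sketch of that parsing step, assuming the same Slack markup the xpath expressions above target (the `base_url` value and function names other than `parse_emoji_from_page` are illustrative, not part of the PR):

```python
# Sketch of the pagination parsing added in this PR.
# Assumes Slack's customize/emoji HTML structure as of this change.
import lxml.html

EMOJI_ENDPOINT = '/customize/emoji'


def parse_emoji_from_page(text: str) -> list:
    # Relative image URLs for every custom emoji listed on one page.
    tree = lxml.html.fromstring(text)
    return tree.xpath(r'//td[@headers="custom_emoji_image"]/span/@data-original')


def pagination_links(text: str, base_url: str) -> list:
    # Absolute URLs for the remaining emoji pages, skipping the "Next"
    # arrow, "#" stubs, and the first page itself.
    tree = lxml.html.fromstring(text)
    hrefs = tree.xpath(r'//div[@class="pagination pagination-centered"]'
                       r'/ul/li/a[.!="Next"]/@href[.!="#"]')
    return [f"{base_url}{p}" for p in hrefs if p != EMOJI_ENDPOINT]
```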
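Separately, `concurrent_http_get` is the bounded-concurrency piece that keeps at most `--concurrent-requests` downloads in flight: one shared `asyncio.Semaphore` gates every request made through a single `aiohttp.ClientSession`. A self-contained sketch of the same pattern, using modern `async with semaphore` syntax and placeholder URLs rather than the PR's exact code:

```python
# Bounded-concurrency download sketch mirroring concurrent_http_get above.
# The URLs are hypothetical; any iterable of (url, name) pairs would do.
import asyncio

import aiohttp


def concurrent_http_get(max_concurrent: int, session: aiohttp.ClientSession):
    semaphore = asyncio.Semaphore(max_concurrent)

    async def http_get(url: str, name: str):
        async with semaphore:  # at most max_concurrent requests in flight
            async with session.get(url) as response:
                body = await response.read()
        return body, name, url

    return http_get


async def demo():
    async with aiohttp.ClientSession() as session:
        http_get = concurrent_http_get(2, session)
        tasks = [http_get(f"https://example.com/{i}.png", f"emoji-{i}")
                 for i in range(5)]
        for future in asyncio.as_completed(tasks):
            body, name, url = await future
            print(name, url, len(body))


if __name__ == '__main__':
    asyncio.run(demo())
```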