Skip to content

Commit

Permalink
chore(python): Add Type Hints (#5)
Browse files Browse the repository at this point in the history
  • Loading branch information
WilliamEspegren authored Jun 24, 2024
1 parent b3f1580 commit 18ef2d3
Show file tree
Hide file tree
Showing 8 changed files with 68 additions and 63 deletions.
31 changes: 17 additions & 14 deletions bench/scrappy.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,32 +3,35 @@
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from urllib.parse import urlparse
from scrapy.settings import Settings
from scrapy.http import Response
from typing import List, Type, Union, ClassVar

# Target URL: first CLI argument if given, otherwise the default benchmark site.
url: str = str(sys.argv[1]) if len(sys.argv) > 1 else "https://rsseau.fr"
# hostname may be None for URLs without a netloc, hence the Union annotation.
host: Union[str, None] = urlparse(url).hostname

class MySpider(CrawlSpider):
    """Crawl every page on the benchmark host, recording each visited URL."""

    # hostname may be None for malformed URLs, hence the Union.
    name: Union[str, None] = host
    allowed_domains: List[str] = [host]
    start_urls: List[str] = [url]
    # Class-level accumulator: parse_item appends every crawled URL here,
    # and the benchmark harness reads it after the crawl finishes.
    links: List[str] = []
    rules: ClassVar[tuple] = (
        # Follow every extracted link and route each response through parse_item.
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    @classmethod
    def update_settings(cls: Type['MySpider'], settings: Settings) -> None:
        """Disable scrapy logging so the benchmark's timing output stays clean."""
        super().update_settings(settings)
        settings.set("LOG_ENABLED", "false", priority="spider")

    def parse_item(self, response: Response) -> None:
        """Record the URL of each crawled page."""
        self.links.append(response.url)

print("benching scrappy(python)...")
process = CrawlerProcess()
spider = MySpider
start = time.time()
process: CrawlerProcess = CrawlerProcess()
spider: Type[MySpider] = MySpider
start: float = time.time()
process.crawl(spider)
process.start()
end = time.time()
print(url, "pages found " + str(len(spider.links)), "elasped duration " + str(end - start) + "s", sep="\n")
end: float = time.time()
print(url, "pages found " + str(len(spider.links)), "elapsed duration " + str(end - start) + "s", sep="\n")
18 changes: 9 additions & 9 deletions bench/spider.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
import asyncio, time, sys
import time
from typing import List
from spider_rs import Website

async def main() -> None:
    """Benchmark spider_rs: crawl one site, report page count and duration."""
    print("benching spider-rs(python)...")
    # Target URL: first CLI argument if given, otherwise the default benchmark site.
    url: str = str(sys.argv[1]) if len(sys.argv) > 1 else "https://rsseau.fr"
    website: Website = Website(url)
    start: float = time.time()
    website.crawl()
    end: float = time.time()
    links: List[str] = website.get_links()
    print(url, "pages found " + str(len(links)), "elapsed duration " + str(end - start) + "s", sep="\n")

asyncio.run(main())
10 changes: 5 additions & 5 deletions examples/basic.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import asyncio
from typing import List
from spider_rs import crawl, Website

from spider_rs import crawl

async def main() -> None:
    """Crawl a site with the convenience `crawl` helper and print its links."""
    website: Website = await crawl("https://choosealicense.com")
    print(website.links)

asyncio.run(main())
7 changes: 3 additions & 4 deletions examples/builder.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
import asyncio

from typing import List
from spider_rs import Website

async def main() -> None:
    """Configure a Website via the fluent builder, crawl it, and print its links."""
    website: Website = (
        Website("https://choosealicense.com", False)
        .with_user_agent("BotBot")
        .with_headers({"authorization": "Something "})
    )
    website.crawl()
    print(website.get_links())

asyncio.run(main())
7 changes: 3 additions & 4 deletions examples/screenshot.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import asyncio

from typing import Dict, Any
from spider_rs import Website

async def main():
website = (
async def main() -> None:
website: Website = (
Website("https://choosealicense.com", False)
.with_screenshot({
"params": {
Expand All @@ -25,5 +25,4 @@ async def main():
website.crawl(None, None, True)
print(website.get_links())


asyncio.run(main())
23 changes: 12 additions & 11 deletions examples/stop.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
import asyncio
from typing import Any
from spider_rs import Website

class Subscription:
    """Per-page callback handed to Website.crawl; prints each page as it arrives."""

    def __init__(self) -> None:
        print("Subscription Created...")

    def __call__(self, page: Any) -> None:
        # page is a spider_rs page object exposing url and status_code.
        print(f"{page.url} - status: {page.status_code}")
        # if (website.size >= 100):
        #     website.stop()

async def main() -> None:
    """Crawl a site, streaming each crawled page through Subscription."""
    website: Website = Website("https://www.drake.com")
    website.crawl(Subscription())

asyncio.run(main())
21 changes: 11 additions & 10 deletions examples/subscription.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
import asyncio
import time

from typing import Any
from spider_rs import Website

class Subscription:
    """Per-page callback handed to Website.crawl; prints each page as it arrives."""

    def __init__(self) -> None:
        print("Subscription Created...")

    def __call__(self, page: Any) -> None:
        # page is a spider_rs page object exposing url and status_code.
        print(f"{page.url} - status: {page.status_code}")

async def main() -> None:
    """Crawl with a wildcard page budget of 200 and report total elapsed time."""
    website: Website = Website("https://www.drake.com").with_budget({"*": 200})
    start_time: float = time.time()
    website.crawl(Subscription())
    print(f"time {time.time() - start_time}")

asyncio.run(main())
14 changes: 8 additions & 6 deletions examples/website.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import asyncio
import time
from typing import List
from spider_rs import Website

async def main() -> None:
    """Crawl a site, print every discovered link, and report elapsed time."""
    website: Website = Website("https://choosealicense.com", False)
    start_time: float = time.time()
    website.crawl()
    links: List[str] = website.get_links()
    print(links)
    print(f"time {time.time() - start_time}")

asyncio.run(main())

0 comments on commit 18ef2d3

Please sign in to comment.