diff --git a/.gitignore b/.gitignore index c673472..977db3d 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ dist newsscrapper_SamirPS.egg-info Pipfile Pipfile.lock +newsscrapper/__pycache__/ diff --git a/README.md b/README.md index c70b6ae..33e3188 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,15 @@ # News-Scrapper -Web Scrapper for news website get the headlines : https://pypi.org/project/newsscrapper-SamirPS/ -POC : https://newswebs.herokuapp.com/ -TODO: Get the title of the article - Get the description of the article +Web Scrapper for getting the headlines of some news websites: https://pypi.org/project/newsscrapper-SamirPS/ + To import the function : ## FR: ```python from newsscrapper import fr ``` * LeMonde : fr.MondeSC() -* LeParisien: fr.ParisSC() * MediaPart: fr.MediaPartSC() * BFMTV : fr.BmftvSC() * Libération : fr.LibeSC() -* FranceTv INFO : fr.LibeSC() ## EN: @@ -31,7 +27,8 @@ from newsscrapper import en ```python from newsscrapper import es ``` -* EFE : es.EFESC() * ELPAIS: es.ELPAISSC() -The function return a dict like {"1":{"link":X,"img":Y,"title":Z}} +The function returns a dict like {"1":{"link":A,"images":{key_image:B,...},"title":C,"description":D}} + +The ... is for multiple images; key_image can be the "heightxwidth" of the picture or a string representing an integer if we don't have the height or width. 
\ No newline at end of file diff --git a/newsscrapper/en.py b/newsscrapper/en.py index 846a0d7..2376a45 100644 --- a/newsscrapper/en.py +++ b/newsscrapper/en.py @@ -1,93 +1,71 @@ -import requests -import re +from utils import return_information -def CNNSC(): - CNN=requests.get("http://rss.cnn.com/rss/edition") - arcnn=re.findall(r'(.*?)',CNN.text) - data={} - for i in range(len(arcnn)): - - picture=re.findall(r'(.*?)',arcnn[i]) - title=re.findall(r'<!\[CDATA\[(.*?)]]>',arcnn[i]) - try: - data[str(i)]={"link":link[0],"img":picture[0],"title":title[0].replace(u"\xa0"," ")} - except: - data[str(i)]={"link":link[0],"img":"","title":title[0].replace(u"\xa0"," ")} - return data +def CNNSC(): + return return_information( + "http://rss.cnn.com/rss/edition", + "item", + "title", + "link", + "description", + "media:content", + "width", + "height", + "url", + ) def FoxNewsSC(): - FoxNews=requests.get("http://feeds.foxnews.com/foxnews/latest") - s=FoxNews.text - s=s.replace("\n","") - arfoxnews=re.findall(r'<item>(.*?)</item>',s) - data={} - for i in range(len(arfoxnews)): - picture=re.findall(r'<media:content url="(.*?)"',arfoxnews[i]) - link=re.findall(r'<guid isPermaLink="true">(.*?)</guid>',arfoxnews[i]) - title=re.findall(r'<title>(.*?)',arfoxnews[i]) - try: - data[str(i)]={"link":link[0],"img":picture[0],"title":title[0].replace(u"\xa0"," ")} - except: - data[str(i)]={"link":link[0],"img":"","title":title[0].replace(u"\xa0"," ")} + return return_information( + "http://feeds.foxnews.com/foxnews/latest", + "item", + "title", + "guid", + "description", + "media:content", + "width", + "height", + "url", + ) - return data def ABCNewsSC(): - ABCNews=requests.get("https://abcnews.go.com/abcnews/moneyheadlines") - s=ABCNews.text - s=s.replace("\n","") - arabcnews=re.findall(r'(.*?)',s) - data={} - for i in range(len(arabcnews)): - picture=re.findall(r'',arabcnews[i]) - title=re.findall(r'<!\[CDATA\[(.*?)]]',arabcnews[i]) - try: - 
data[str(i)]={"link":link[0],"img":picture[0],"title":title[0].replace(u"\xa0"," ")} - except: - data[str(i)]={"link":link[0],"img":"","title":title[0].replace(u"\xa0"," ")} - - return data - + return return_information( + "https://abcnews.go.com/abcnews/moneyheadlines", + "item", + "title", + "link", + "description", + "media:thumbnail", + "width", + "height", + "url", + ) def TheGuardianSC(): - TheGuardianSC=requests.get("https://www.theguardian.com/us-news/rss") - s=TheGuardianSC.text - s=s.replace("\n","") - arguardian=re.findall(r'<item>(.*?)</item>',s) - data={} - for i in range(len(arguardian)): - picture=re.findall(r'<media:content width="140" url="(.*?)"',arguardian[i]) - link=re.findall(r'<link>(.*?)</link>',arguardian[i]) - - title=re.findall(r'<title>(.*?)',arguardian[i]) - try: - picture[0]=picture[0].replace("amp;","") - data[str(i)]={"link":link[0],"img":picture[0],"title":title[0].replace(u"\xa0"," ")} - except: - data[str(i)]={"link":link[0],"img":"","title":title[0].replace(u"\xa0"," ")} + return return_information( + "https://www.theguardian.com/us-news/rss", + "item", + "title", + "guid", + "description", + "media:content", + "width", + "height", + "url", + ) - return data def TheNewYorkTimesSC(): - TheNewYorkTimesSC=requests.get("https://www.nytimes.com/services/xml/rss/nyt/HomePage.xml") - s=TheNewYorkTimesSC.text - s=s.replace("\n","") - arnytimes=re.findall(r'(.*?)',s) - data={} - - for i in range(len(arnytimes)): - picture=re.findall(r'medium="image" url="(.*?)" width',arnytimes[i]) - link=re.findall(r'(.*?)',arnytimes[i]) - title=re.findall(r'(.*?)',arnytimes[i]) - try: - data[str(i)]={"link":link[0],"img":picture[0],"title":title[0].replace(u"\xa0"," ")} - except: - data[str(i)]={"link":link[0],"img":"","title":title[0].replace(u"\xa0"," ")} - - return data - + return return_information( + "https://www.nytimes.com/services/xml/rss/nyt/HomePage.xml", + "item", + "title", + "link", + "description", + "media:content", + "width", + "height", 
+ "url", + ) diff --git a/newsscrapper/es.py b/newsscrapper/es.py index 3f46215..27d3572 100644 --- a/newsscrapper/es.py +++ b/newsscrapper/es.py @@ -1,39 +1,15 @@ -import requests -import re +from utils import return_information -def ELPAISSC(): - ELPAIS=requests.get("http://ep00.epimg.net/rss/tags/ultimas_noticias.xml") - s=ELPAIS.text.replace("\n",'') - arelpais=re.findall(r'(.*?)',s) - data={} - for i in range(len(arelpais)): - - picture=re.findall(r'',arelpais[i]) - title=re.findall(r'<!\[CDATA\[(.*?)]]>',arelpais[i]) - - try: - data[str(i)]={"link":link[0],"img":picture[0],"title":title[0].replace(u"\xa0"," ")} - except: - data[str(i)]={"link":link[0],"img":"","title":title[0].replace(u"\xa0"," ")} - - return data - -def EFESC(): - EFE=requests.get("https://www.efe.com/efe/espana/1/rss") - s=EFE.text.replace("\n",'') - arefe=re.findall(r'<item>(.*?)</item>',s) - data={} - for i in range(len(arefe)): - - picture=re.findall(r'<enclosure url="(.*?)" length',arefe[i]) - link=re.findall(r'<link>(.*?)?utm_source',arefe[i]) - title=re.findall(r'<title>(.*?)',arefe[i]) - - try: - data[str(i)]={"link":link[0],"img":picture[0],"title":title[0].replace(u"\xa0"," ")} - except: - data[str(i)]={"link":link[0],"img":"","title":title[0].replace(u"\xa0"," ")} - - return data +def ELPAISSC(): + return return_information( + "https://feeds.elpais.com/mrss-s/pages/ep/site/elpais.com/portada", + "item", + "title", + "link", + "description", + "media:content", + "width", + "height", + "url", + ) diff --git a/newsscrapper/fr.py b/newsscrapper/fr.py index 08692d8..569537d 100644 --- a/newsscrapper/fr.py +++ b/newsscrapper/fr.py @@ -1,117 +1,57 @@ -import requests -import re +from utils import return_information -def MondeSC(): - Monde=requests.get("https://www.lemonde.fr/rss/une.xml") - armonde=re.findall(r'(.*?)',Monde.text) - data={} - for i in range(len(armonde)): - picture=re.findall(r'(.*?)',armonde[i]) - title=re.findall(r'<!\[CDATA\[(.*?)]]>',armonde[i]) - try: - 
data[str(i)]={"link":link[0],"img":picture[0],"title":title[0].replace(u"\xa0"," ")} - except: - data[str(i)]={"link":link[0],"img":"","title":title[0].replace(u"\xa0"," ")} - - return data -def ParisSC(): - Paris=requests.get("https://www.leparisien.fr/arcio/sitemap/master/") - - s=Paris.text - s=s.replace("\n","") - arparis=re.findall(r'<url>(.*?)</url>',s) - data={} - for i in range(len(arparis)): - picture=re.findall(r'<image:loc>(.*?)</image:loc>',arparis[i]) - link=re.findall(r'<loc>(.*?)</loc>',arparis[i]) - title=link[0].split("/") - titre=title[-1].split("-") - fini=" ".join(titre[0:len(titre)-4]) - try: - data[str(i)]={"link":link[0],"img":picture[0],"title":fini} - except: - data[str(i)]={"link":link[0],"img":"","title":fini} - return data +def MondeSC(): + return return_information( + "https://www.lemonde.fr/rss/une.xml", + "item", + "title", + "link", + "description", + "media:content", + "width", + "height", + "url", + ) def MediaPartSC(): - MediaPart=requests.get("https://www.mediapart.fr/articles/feed") - s=MediaPart.text - s=s.replace("\n","") - armediapart=re.findall(r'<item>(.*?)</item>',s) - data={} - for i in range(len(armediapart)): - picture=re.findall(r'<media:content url="(.*?)"',armediapart[i]) - link=re.findall(r'<link>(.*?)</link>',armediapart[i]) - title=re.findall(r'<title><!\[CDATA\[(.*?)]]>',armediapart[i]) - - try: - data[str(i)]={"link":link[0],"img":picture[0],"title":title[0].replace(u"\xa0"," ")} - except: - data[str(i)]={"link":link[0],"img":"","title":title[0].replace(u"\xa0"," ")} - return data - + return return_information( + "https://www.mediapart.fr/articles/feed", + "item", + "title", + "link", + "description", + "media:content", + "width", + "height", + "url", + ) def BmftvSC(): - Bfmtv=requests.get("https://www.bfmtv.com/rss/news-24-7/") - s=Bfmtv.text - s=s.replace("\n","") - arbfmtv=re.findall(r'<item>(.*?)</item>',s) - data={} - for i in range(len(arbfmtv)): - picture=re.findall(r'<enclosure url="(.*?)"',arbfmtv[i]) 
- link=re.findall(r'<link>(.*?)</link>',arbfmtv[i]) - title=re.findall(r'<title><!\[CDATA\[(.*?)]]>',arbfmtv[i]) - - try: - data[str(i)]={"link":link[0],"img":picture[0],"title":title[0].replace(u"\xa0"," ")} - except: - data[str(i)]={"link":link[0],"img":"","title":title[0].replace(u"\xa0"," ")} - return data + return return_information( + "https://www.bfmtv.com/rss/news-24-7/", + "item", + "title", + "link", + "description", + "enclosure", + "width", + "height", + "url", + ) def LibeSC(): - Liberation=requests.get("https://www.liberation.fr/arc/outboundfeeds/rss/?outputType=xml") - s=Liberation.text - s=s.replace("\n","") - arlibe=re.findall(r'<item>(.*?)</item>',s) - data={} - for i in range(len(arlibe)): - picture=re.findall(r'https://liberation-liberation-prod.cdn.arcpublishing.com/resizer/(.*?)">',arlibe[i]) - link=re.findall(r'<link>(.*?)</link>',arlibe[i]) - title=re.findall(r'<title>(.*?)',arlibe[i]) - title[0]=title[0].replace("","") - addj="https://liberation-liberation-prod.cdn.arcpublishing.com/resizer/" - try: - data[str(i)]={"link":link[0],"img":addj+picture[0],"title":title[0].replace(u"\xa0"," ")} - except: - data[str(i)]={"link":link[0],"img":"","title":title[0].replace(u"\xa0"," ")} - return data - - - -def FTISC(): - Francetvinfo=requests.get("https://www.francetvinfo.fr/titres.rss") - s=Francetvinfo.text - s=s.replace("\n","") - arfti=re.findall('(.*?)',s) - data={} - for i in range(len(arfti)): - picture=re.findall(r'url="(.*?)"',arfti[i]) - link=re.findall(r'(.*?)',arfti[i]) - title=re.findall(r'(.*?)',arfti[i]) - - try: - data[str(i)]={"link":link[0],"img":picture[0],"title":title[0].replace(u"\xa0"," ")} - except: - data[str(i)]={"link":link[0],"img":"","title":title[0].replace(u"\xa0"," ")} - return data - - - - - - + return return_information( + "https://www.liberation.fr/arc/outboundfeeds/rss/?outputType=xml", + "item", + "title", + "link", + "description", + "media:content", + "width", + "height", + "url", + ) diff --git 
a/newsscrapper/utils.py b/newsscrapper/utils.py new file mode 100644 index 0000000..2fd2b93 --- /dev/null +++ b/newsscrapper/utils.py @@ -0,0 +1,59 @@ +from bs4 import BeautifulSoup +import requests + + +def return_information( + url, + tag_article, + tag_title, + tag_link, + tag_description, + tag_image, + tag_image_h, + tag_image_w, + tag_image_link, +): + website = requests.get(url) + data = {} + content = website.content + soup = BeautifulSoup(content, "xml") + items = soup.find_all(tag_article) + for count, item in enumerate(items): + + try: + title = item.find(tag_title).text + except AttributeError: + title = None + + try: + link = item.find(tag_link).text + except AttributeError: + link = None + + try: + description = item.find(tag_description).text + except AttributeError: + description = None + + images = {} + for count_image, image in enumerate(item.find_all(tag_image)): + try: + h = image.get(tag_image_h) + w = image.get(tag_image_w) + if h is None or w is None: + key = count_image + else: + key = f"{h}x{w}" + link_image = image.get(tag_image_link) + images[key] = link_image + except: + pass + + data[f"{count}"] = { + "link": link, + "images": images, + "title": title, + "description": description, + } + + return data diff --git a/setup.py b/setup.py index f175196..2ebd3cf 100644 --- a/setup.py +++ b/setup.py @@ -4,18 +4,18 @@ long_description = fh.read() setuptools.setup( - name="newsscrapper-SamirPS", # Replace with your own username - version="0.1.7", + name="newsscrapper-SamirPS", # Replace with your own username + version="1.0.0", author="SamirPS", long_description=long_description, long_description_content_type="text/markdown", url="https://github.com/SamirPS/News-Scrapper", - install_requires=["requests"], + install_requires=["requests", "bs4", "lxml"], packages=setuptools.find_packages(), classifiers=[ "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ], - 
python_requires='>=3.6', + python_requires=">=3.6", )