diff --git a/.gitignore b/.gitignore
index c673472..977db3d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ dist
newsscrapper_SamirPS.egg-info
Pipfile
Pipfile.lock
+newsscrapper/__pycache__/
diff --git a/README.md b/README.md
index c70b6ae..33e3188 100644
--- a/README.md
+++ b/README.md
@@ -1,19 +1,15 @@
# News-Scrapper
-Web Scrapper for news website get the headlines : https://pypi.org/project/newsscrapper-SamirPS/
-POC : https://newswebs.herokuapp.com/
-TODO: Get the title of the article
- Get the description of the article
+Web Scrapper for getting the headlines of some news websites: https://pypi.org/project/newsscrapper-SamirPS/
+
To import the function :
## FR:
```python
from newsscrapper import fr
```
* LeMonde : fr.MondeSC()
-* LeParisien: fr.ParisSC()
* MediaPart: fr.MediaPartSC()
* BFMTV : fr.BmftvSC()
* Libération : fr.LibeSC()
-* FranceTv INFO : fr.LibeSC()
## EN:
@@ -31,7 +27,8 @@ from newsscrapper import en
```python
from newsscrapper import es
```
-* EFE : es.EFESC()
* ELPAIS: es.ELPAISSC()
-The function return a dict like {"1":{"link":X,"img":Y,"title":Z}}
+The function return a dict like {"1":{"link":A,"images":{key_image:B,...},"title":C,"description":D}}
+
+The ... is for multiple images; key_image can be the "heightxwidth" of the picture or a string representing an integer if we don't have the height or width.
\ No newline at end of file
diff --git a/newsscrapper/en.py b/newsscrapper/en.py
index 846a0d7..2376a45 100644
--- a/newsscrapper/en.py
+++ b/newsscrapper/en.py
@@ -1,93 +1,71 @@
-import requests
-import re
+from .utils import return_information
-def CNNSC():
- CNN=requests.get("http://rss.cnn.com/rss/edition")
- arcnn=re.findall(r'- (.*?)
',CNN.text)
- data={}
- for i in range(len(arcnn)):
-
- picture=re.findall(r'(.*?)',arcnn[i])
- title=re.findall(r'',arcnn[i])
- try:
- data[str(i)]={"link":link[0],"img":picture[0],"title":title[0].replace(u"\xa0"," ")}
- except:
- data[str(i)]={"link":link[0],"img":"","title":title[0].replace(u"\xa0"," ")}
- return data
+def CNNSC():
+ return return_information(
+ "http://rss.cnn.com/rss/edition",
+ "item",
+ "title",
+ "link",
+ "description",
+ "media:content",
+ "width",
+ "height",
+ "url",
+ )
def FoxNewsSC():
- FoxNews=requests.get("http://feeds.foxnews.com/foxnews/latest")
- s=FoxNews.text
- s=s.replace("\n","")
- arfoxnews=re.findall(r'- (.*?)
',s)
- data={}
- for i in range(len(arfoxnews)):
- picture=re.findall(r'(.*?)',arfoxnews[i])
- title=re.findall(r'(.*?)',arfoxnews[i])
- try:
- data[str(i)]={"link":link[0],"img":picture[0],"title":title[0].replace(u"\xa0"," ")}
- except:
- data[str(i)]={"link":link[0],"img":"","title":title[0].replace(u"\xa0"," ")}
+ return return_information(
+ "http://feeds.foxnews.com/foxnews/latest",
+ "item",
+ "title",
+ "guid",
+ "description",
+ "media:content",
+ "width",
+ "height",
+ "url",
+ )
- return data
def ABCNewsSC():
- ABCNews=requests.get("https://abcnews.go.com/abcnews/moneyheadlines")
- s=ABCNews.text
- s=s.replace("\n","")
- arabcnews=re.findall(r'- (.*?)
',s)
- data={}
- for i in range(len(arabcnews)):
- picture=re.findall(r'',arabcnews[i])
- title=re.findall(r'(.*?)',s)
- data={}
- for i in range(len(arguardian)):
- picture=re.findall(r'(.*?)',arguardian[i])
-
- title=re.findall(r'(.*?)',arguardian[i])
- try:
- picture[0]=picture[0].replace("amp;","")
- data[str(i)]={"link":link[0],"img":picture[0],"title":title[0].replace(u"\xa0"," ")}
- except:
- data[str(i)]={"link":link[0],"img":"","title":title[0].replace(u"\xa0"," ")}
+ return return_information(
+ "https://www.theguardian.com/us-news/rss",
+ "item",
+ "title",
+ "guid",
+ "description",
+ "media:content",
+ "width",
+ "height",
+ "url",
+ )
- return data
def TheNewYorkTimesSC():
- TheNewYorkTimesSC=requests.get("https://www.nytimes.com/services/xml/rss/nyt/HomePage.xml")
- s=TheNewYorkTimesSC.text
- s=s.replace("\n","")
- arnytimes=re.findall(r'- (.*?)
',s)
- data={}
-
- for i in range(len(arnytimes)):
- picture=re.findall(r'medium="image" url="(.*?)" width',arnytimes[i])
- link=re.findall(r'(.*?)',arnytimes[i])
- title=re.findall(r'(.*?)',arnytimes[i])
- try:
- data[str(i)]={"link":link[0],"img":picture[0],"title":title[0].replace(u"\xa0"," ")}
- except:
- data[str(i)]={"link":link[0],"img":"","title":title[0].replace(u"\xa0"," ")}
-
- return data
-
+ return return_information(
+ "https://www.nytimes.com/services/xml/rss/nyt/HomePage.xml",
+ "item",
+ "title",
+ "link",
+ "description",
+ "media:content",
+ "width",
+ "height",
+ "url",
+ )
diff --git a/newsscrapper/es.py b/newsscrapper/es.py
index 3f46215..27d3572 100644
--- a/newsscrapper/es.py
+++ b/newsscrapper/es.py
@@ -1,39 +1,15 @@
-import requests
-import re
+from .utils import return_information
-def ELPAISSC():
- ELPAIS=requests.get("http://ep00.epimg.net/rss/tags/ultimas_noticias.xml")
- s=ELPAIS.text.replace("\n",'')
- arelpais=re.findall(r'- (.*?)
',s)
- data={}
- for i in range(len(arelpais)):
-
- picture=re.findall(r'',arelpais[i])
- title=re.findall(r'',arelpais[i])
-
- try:
- data[str(i)]={"link":link[0],"img":picture[0],"title":title[0].replace(u"\xa0"," ")}
- except:
- data[str(i)]={"link":link[0],"img":"","title":title[0].replace(u"\xa0"," ")}
-
- return data
-
-def EFESC():
- EFE=requests.get("https://www.efe.com/efe/espana/1/rss")
- s=EFE.text.replace("\n",'')
- arefe=re.findall(r'- (.*?)
',s)
- data={}
- for i in range(len(arefe)):
-
- picture=re.findall(r'(.*?)?utm_source',arefe[i])
- title=re.findall(r'(.*?)',arefe[i])
-
- try:
- data[str(i)]={"link":link[0],"img":picture[0],"title":title[0].replace(u"\xa0"," ")}
- except:
- data[str(i)]={"link":link[0],"img":"","title":title[0].replace(u"\xa0"," ")}
-
- return data
+def ELPAISSC():
+ return return_information(
+ "https://feeds.elpais.com/mrss-s/pages/ep/site/elpais.com/portada",
+ "item",
+ "title",
+ "link",
+ "description",
+ "media:content",
+ "width",
+ "height",
+ "url",
+ )
diff --git a/newsscrapper/fr.py b/newsscrapper/fr.py
index 08692d8..569537d 100644
--- a/newsscrapper/fr.py
+++ b/newsscrapper/fr.py
@@ -1,117 +1,57 @@
-import requests
-import re
+from .utils import return_information
-def MondeSC():
- Monde=requests.get("https://www.lemonde.fr/rss/une.xml")
- armonde=re.findall(r'- (.*?)
',Monde.text)
- data={}
- for i in range(len(armonde)):
- picture=re.findall(r'(.*?)',armonde[i])
- title=re.findall(r'',armonde[i])
- try:
- data[str(i)]={"link":link[0],"img":picture[0],"title":title[0].replace(u"\xa0"," ")}
- except:
- data[str(i)]={"link":link[0],"img":"","title":title[0].replace(u"\xa0"," ")}
-
- return data
-def ParisSC():
- Paris=requests.get("https://www.leparisien.fr/arcio/sitemap/master/")
-
- s=Paris.text
- s=s.replace("\n","")
- arparis=re.findall(r'(.*?)',s)
- data={}
- for i in range(len(arparis)):
- picture=re.findall(r'(.*?)',arparis[i])
- link=re.findall(r'(.*?)',arparis[i])
- title=link[0].split("/")
- titre=title[-1].split("-")
- fini=" ".join(titre[0:len(titre)-4])
- try:
- data[str(i)]={"link":link[0],"img":picture[0],"title":fini}
- except:
- data[str(i)]={"link":link[0],"img":"","title":fini}
- return data
+def MondeSC():
+ return return_information(
+ "https://www.lemonde.fr/rss/une.xml",
+ "item",
+ "title",
+ "link",
+ "description",
+ "media:content",
+ "width",
+ "height",
+ "url",
+ )
def MediaPartSC():
- MediaPart=requests.get("https://www.mediapart.fr/articles/feed")
- s=MediaPart.text
- s=s.replace("\n","")
- armediapart=re.findall(r'- (.*?)
',s)
- data={}
- for i in range(len(armediapart)):
- picture=re.findall(r'(.*?)',armediapart[i])
- title=re.findall(r'',armediapart[i])
-
- try:
- data[str(i)]={"link":link[0],"img":picture[0],"title":title[0].replace(u"\xa0"," ")}
- except:
- data[str(i)]={"link":link[0],"img":"","title":title[0].replace(u"\xa0"," ")}
- return data
-
+ return return_information(
+ "https://www.mediapart.fr/articles/feed",
+ "item",
+ "title",
+ "link",
+ "description",
+ "media:content",
+ "width",
+ "height",
+ "url",
+ )
def BmftvSC():
- Bfmtv=requests.get("https://www.bfmtv.com/rss/news-24-7/")
- s=Bfmtv.text
- s=s.replace("\n","")
- arbfmtv=re.findall(r'- (.*?)
',s)
- data={}
- for i in range(len(arbfmtv)):
- picture=re.findall(r'(.*?)',arbfmtv[i])
- title=re.findall(r'',arbfmtv[i])
-
- try:
- data[str(i)]={"link":link[0],"img":picture[0],"title":title[0].replace(u"\xa0"," ")}
- except:
- data[str(i)]={"link":link[0],"img":"","title":title[0].replace(u"\xa0"," ")}
- return data
+ return return_information(
+ "https://www.bfmtv.com/rss/news-24-7/",
+ "item",
+ "title",
+ "link",
+ "description",
+ "enclosure",
+ "width",
+ "height",
+ "url",
+ )
def LibeSC():
- Liberation=requests.get("https://www.liberation.fr/arc/outboundfeeds/rss/?outputType=xml")
- s=Liberation.text
- s=s.replace("\n","")
- arlibe=re.findall(r'- (.*?)
',s)
- data={}
- for i in range(len(arlibe)):
- picture=re.findall(r'https://liberation-liberation-prod.cdn.arcpublishing.com/resizer/(.*?)">',arlibe[i])
- link=re.findall(r'(.*?)',arlibe[i])
- title=re.findall(r'(.*?)',arlibe[i])
- title[0]=title[0].replace("","")
- addj="https://liberation-liberation-prod.cdn.arcpublishing.com/resizer/"
- try:
- data[str(i)]={"link":link[0],"img":addj+picture[0],"title":title[0].replace(u"\xa0"," ")}
- except:
- data[str(i)]={"link":link[0],"img":"","title":title[0].replace(u"\xa0"," ")}
- return data
-
-
-
-def FTISC():
- Francetvinfo=requests.get("https://www.francetvinfo.fr/titres.rss")
- s=Francetvinfo.text
- s=s.replace("\n","")
- arfti=re.findall('- (.*?)
',s)
- data={}
- for i in range(len(arfti)):
- picture=re.findall(r'url="(.*?)"',arfti[i])
- link=re.findall(r'(.*?)',arfti[i])
- title=re.findall(r'(.*?)',arfti[i])
-
- try:
- data[str(i)]={"link":link[0],"img":picture[0],"title":title[0].replace(u"\xa0"," ")}
- except:
- data[str(i)]={"link":link[0],"img":"","title":title[0].replace(u"\xa0"," ")}
- return data
-
-
-
-
-
-
+ return return_information(
+ "https://www.liberation.fr/arc/outboundfeeds/rss/?outputType=xml",
+ "item",
+ "title",
+ "link",
+ "description",
+ "media:content",
+ "width",
+ "height",
+ "url",
+ )
diff --git a/newsscrapper/utils.py b/newsscrapper/utils.py
new file mode 100644
index 0000000..2fd2b93
--- /dev/null
+++ b/newsscrapper/utils.py
@@ -0,0 +1,57 @@
+from bs4 import BeautifulSoup
+import requests
+
+
+def _find_text(item, tag):
+    """Return the text of the first *tag* child of *item*, or None if absent."""
+    found = item.find(tag)
+    return found.text if found is not None else None
+
+
+def return_information(
+    url,
+    tag_article,
+    tag_title,
+    tag_link,
+    tag_description,
+    tag_image,
+    tag_image_w,
+    tag_image_h,
+    tag_image_link,
+):
+    """Scrape an XML/RSS feed and return its articles as a dict.
+
+    url -- address of the feed to download.
+    tag_article -- tag wrapping one article (e.g. "item").
+    tag_title / tag_link / tag_description -- tags holding those fields.
+    tag_image -- tag wrapping one image (e.g. "media:content").
+    tag_image_w / tag_image_h -- attributes holding the image width/height.
+    tag_image_link -- attribute holding the image URL.
+
+    Returns {"0": {"link": ..., "images": {...}, "title": ...,
+    "description": ...}, ...}.  Image keys are "<height>x<width>" when both
+    attributes are present, otherwise the image's position (as a string)
+    inside the article.
+    """
+    # A timeout keeps a stalled feed from hanging the caller forever.
+    website = requests.get(url, timeout=10)
+    soup = BeautifulSoup(website.content, "xml")
+    data = {}
+    for count, item in enumerate(soup.find_all(tag_article)):
+        images = {}
+        for count_image, image in enumerate(item.find_all(tag_image)):
+            width = image.get(tag_image_w)
+            height = image.get(tag_image_h)
+            # Key by dimensions when known, by position otherwise.
+            if width is None or height is None:
+                key = str(count_image)
+            else:
+                key = f"{height}x{width}"
+            images[key] = image.get(tag_image_link)
+        data[str(count)] = {
+            "link": _find_text(item, tag_link),
+            "images": images,
+            "title": _find_text(item, tag_title),
+            "description": _find_text(item, tag_description),
+        }
+    return data
diff --git a/setup.py b/setup.py
index f175196..2ebd3cf 100644
--- a/setup.py
+++ b/setup.py
@@ -4,18 +4,18 @@
long_description = fh.read()
setuptools.setup(
- name="newsscrapper-SamirPS", # Replace with your own username
- version="0.1.7",
+ name="newsscrapper-SamirPS", # Replace with your own username
+ version="1.0.0",
author="SamirPS",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/SamirPS/News-Scrapper",
- install_requires=["requests"],
+ install_requires=["requests", "bs4", "lxml"],
packages=setuptools.find_packages(),
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
],
- python_requires='>=3.6',
+ python_requires=">=3.6",
)