Skip to content

Commit

Permalink
Now, use bs4 instead of regex to get the article's information
Browse files Browse the repository at this point in the history
and create a function to simplify the code.

On es.py, remove EFE.
On fr.py, remove LeParisien,FranceTv Info.
  • Loading branch information
Samir Akarioh authored and SamirPS committed Dec 7, 2022
1 parent d38a024 commit 73e440d
Show file tree
Hide file tree
Showing 7 changed files with 185 additions and 234 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ dist
newsscrapper_SamirPS.egg-info
Pipfile
Pipfile.lock
newsscrapper/__pycache__/
13 changes: 5 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,19 +1,15 @@
# News-Scrapper
Web scraper for news websites to get the headlines: https://pypi.org/project/newsscrapper-SamirPS/
POC : https://newswebs.herokuapp.com/
TODO: Get the title of the article
Get the description of the article
Web scraper for getting the headlines of some news websites: https://pypi.org/project/newsscrapper-SamirPS/

To import the function :
## FR:
```python
from newsscrapper import fr
```
* LeMonde : fr.MondeSC()
* LeParisien: fr.ParisSC()
* MediaPart: fr.MediaPartSC()
* BFMTV : fr.BmftvSC()
* Libération : fr.LibeSC()
* FranceTv INFO : fr.LibeSC()


## EN:
Expand All @@ -31,7 +27,8 @@ from newsscrapper import en
```python
from newsscrapper import es
```
* EFE : es.EFESC()
* ELPAIS: es.ELPAISSC()

The functions return a dict like {"1":{"link":X,"img":Y,"title":Z}}
The functions return a dict like {"1":{"link":A,"images":{key_image:B,...},"title":C,"description":D}}

The ... is for multiple images; key_image can be the "heightxwidth" of the picture or a string representing an integer if we don't have the height or width.
136 changes: 57 additions & 79 deletions newsscrapper/en.py
Original file line number Diff line number Diff line change
@@ -1,93 +1,71 @@
import requests
import re
from utils import return_information

def CNNSC():
    """Scrape the CNN "edition" RSS feed.

    Returns a dict keyed by the item's position as a string ("0", "1", ...);
    each value is {"link": ..., "img": ..., "title": ...}. "img" is "" when
    the item carries no <media:content> image.

    NOTE(review): parsing XML with regular expressions is fragile; an XML
    parser (e.g. xml.etree.ElementTree) would be more robust. Unlike the
    sibling functions, this one does not strip newlines from the response,
    so multi-line <item> elements may be missed -- kept as-is to preserve
    behavior.
    """
    response = requests.get("http://rss.cnn.com/rss/edition")
    items = re.findall(r"<item>(.*?)</item>", response.text)
    data = {}
    for i, item in enumerate(items):
        pictures = re.findall(r'<media:content medium="image" url="(.*?)"', item)
        links = re.findall(r"<link>(.*?)</link>", item)
        titles = re.findall(r"<title><!\[CDATA\[(.*?)]]>", item)
        data[str(i)] = {
            "link": links[0],
            # First image when present. The original used a bare `except:`
            # that also swallowed unrelated errors; only the missing-image
            # case is handled now.
            "img": pictures[0] if pictures else "",
            # Replace non-breaking spaces with plain spaces.
            "title": titles[0].replace("\xa0", " "),
        }
    return data
def CNNSC():
    """Headlines from the CNN "edition" RSS feed.

    Delegates fetching and parsing to the shared return_information helper.
    """
    feed_url = "http://rss.cnn.com/rss/edition"
    # Field/tag names the helper should look for in each <item>.
    fields = (
        "item", "title", "link", "description",
        "media:content", "width", "height", "url",
    )
    return return_information(feed_url, *fields)


def FoxNewsSC():
    """Scrape the Fox News "latest" RSS feed.

    Delegates fetching and parsing to the shared return_information helper
    (same contract as the other *SC functions in this module).

    Fix: the original body first ran a leftover regex-scraping loop whose
    result was discarded (a half-finished migration), performing a redundant
    HTTP request, and ended with an unreachable `return data`. The dead code
    is removed; the returned value is unchanged.
    """
    # <guid isPermaLink="true"> carries the article URL in this feed,
    # hence "guid" rather than "link".
    return return_information(
        "http://feeds.foxnews.com/foxnews/latest",
        "item",
        "title",
        "guid",
        "description",
        "media:content",
        "width",
        "height",
        "url",
    )

def ABCNewsSC():
    """Scrape the ABC News money-headlines RSS feed (regex-based).

    Returns {"0": {"link": ..., "img": ..., "title": ...}, ...}; "img" is
    "" when an item has no <media:thumbnail>.

    Fix: the original contained an unreachable return_information(...) call
    after `return data` (a half-finished migration); the dead code is
    removed and the executed path kept. NOTE(review): migrating this
    function to return_information like its siblings is still pending.
    """
    response = requests.get("https://abcnews.go.com/abcnews/moneyheadlines")
    # Flatten the payload so the non-DOTALL regexes can span elements.
    body = response.text.replace("\n", "")
    items = re.findall(r"<item>(.*?)</item>", body)
    data = {}
    for i, item in enumerate(items):
        pictures = re.findall(r'<media:thumbnail url="(.*?)"', item)
        links = re.findall(r"<link><!\[CDATA\[(.*?)]]>", item)
        titles = re.findall(r"<title><!\[CDATA\[(.*?)]]", item)
        data[str(i)] = {
            "link": links[0],
            # Thumbnail when present; the original's bare `except:` also
            # hid unrelated errors.
            "img": pictures[0] if pictures else "",
            "title": titles[0].replace("\xa0", " "),
        }
    return data


def TheGuardianSC():
    """Scrape The Guardian US-news RSS feed.

    Delegates fetching and parsing to the shared return_information helper.

    Fix: the original ran a leftover regex-scraping loop whose result was
    discarded before `return return_information(...)` (costing a redundant
    HTTP request) and ended with an unreachable `return data`. Only the
    live path is kept; the returned value is unchanged.
    """
    # "guid" is used as the link field for this feed (matches the
    # half-migrated original's arguments).
    return return_information(
        "https://www.theguardian.com/us-news/rss",
        "item",
        "title",
        "guid",
        "description",
        "media:content",
        "width",
        "height",
        "url",
    )

def TheNewYorkTimesSC():
    """Scrape The New York Times homepage RSS feed (regex-based).

    Returns {"0": {"link": ..., "img": ..., "title": ...}, ...}; "img" is
    "" when an item has no medium="image" media content.

    Fix: the original contained an unreachable return_information(...) call
    after `return data` (a half-finished migration); the dead code is
    removed and the executed path kept. NOTE(review): migrating this
    function to return_information like its siblings is still pending.
    """
    response = requests.get("https://www.nytimes.com/services/xml/rss/nyt/HomePage.xml")
    # Flatten the payload so the non-DOTALL regexes can span elements.
    body = response.text.replace("\n", "")
    items = re.findall(r"<item>(.*?)</item>", body)
    data = {}
    for i, item in enumerate(items):
        pictures = re.findall(r'medium="image" url="(.*?)" width', item)
        links = re.findall(r"<link>(.*?)</link>", item)
        titles = re.findall(r"<title>(.*?)</title>", item)
        data[str(i)] = {
            "link": links[0],
            # Image when present; the original's bare `except:` also hid
            # unrelated errors.
            "img": pictures[0] if pictures else "",
            "title": titles[0].replace("\xa0", " "),
        }
    return data
50 changes: 13 additions & 37 deletions newsscrapper/es.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,15 @@
import requests
import re
from utils import return_information

def ELPAISSC():
    """Scrape the El País "últimas noticias" RSS feed (regex-based).

    Returns {"0": {"link": ..., "img": ..., "title": ...}, ...}; "img" is
    "" when an item has no <enclosure> image.
    """
    response = requests.get("http://ep00.epimg.net/rss/tags/ultimas_noticias.xml")
    # Flatten the payload so the non-DOTALL regexes can span elements.
    body = response.text.replace("\n", "")
    items = re.findall(r"<item>(.*?)</item>", body)
    data = {}
    for i, item in enumerate(items):
        pictures = re.findall(r'<enclosure url="(.*?)" length', item)
        links = re.findall(r"<link><!\[CDATA\[(.*?)]]>", item)
        titles = re.findall(r"<title><!\[CDATA\[(.*?)]]>", item)
        data[str(i)] = {
            "link": links[0],
            # Enclosure image when present. The original used a bare
            # `except:` that also swallowed unrelated errors; only the
            # missing-image case is handled now.
            "img": pictures[0] if pictures else "",
            "title": titles[0].replace("\xa0", " "),
        }
    return data

def EFESC():
    """Scrape the EFE España RSS feed (regex-based).

    Returns {"0": {"link": ..., "img": ..., "title": ...}, ...}; "img" is
    "" when an item has no <enclosure> image.
    """
    response = requests.get("https://www.efe.com/efe/espana/1/rss")
    # Flatten the payload so the non-DOTALL regexes can span elements.
    body = response.text.replace("\n", "")
    items = re.findall(r"<item>(.*?)</item>", body)
    data = {}
    for i, item in enumerate(items):
        pictures = re.findall(r'<enclosure url="(.*?)" length', item)
        # NOTE(review): the trailing `?` makes the capture group optional;
        # it was probably meant as a literal `\?` before "utm_source".
        # Kept byte-identical to preserve the captured value.
        links = re.findall(r'<link>(.*?)?utm_source', item)
        titles = re.findall(r"<title>(.*?)</title>", item)
        data[str(i)] = {
            "link": links[0],
            # Enclosure image when present; the original's bare `except:`
            # also hid unrelated errors.
            "img": pictures[0] if pictures else "",
            "title": titles[0].replace("\xa0", " "),
        }
    return data

def ELPAISSC():
    """Headlines from the El País front-page MRSS feed.

    Delegates fetching and parsing to the shared return_information helper.
    """
    feed_url = "https://feeds.elpais.com/mrss-s/pages/ep/site/elpais.com/portada"
    # Field/tag names the helper should look for in each <item>.
    fields = (
        "item", "title", "link", "description",
        "media:content", "width", "height", "url",
    )
    return return_information(feed_url, *fields)
Loading

0 comments on commit 73e440d

Please sign in to comment.