Commit
Added OPML import support.
Added a parameter for the minimal interval between requests to the same domain, to avoid flooding.
Removed the global logger state from the library; the logger is now passed as a parameter.
Fixed test. Added OPML test.
Moved screenshot images out of library directory.
Changed flags to be more verbose.
Simplified the CTRL+C-to-exit code.
Other refactorings.
Update() is still way too convoluted and could be broken down a bit.
ww9 committed Nov 20, 2018
1 parent d0c13e0 commit 91dfdd7
Showing 13 changed files with 285 additions and 90 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -4,4 +4,5 @@ debug.test
/feed/test_data/news
/docs
/news
/release
/release
debug
30 changes: 19 additions & 11 deletions README.md
@@ -11,7 +11,7 @@ News is a minimalist RSS/Atom aggregator that saves to HTML files.

That's it! No database, no configuration files, no HTTP server, no ads, no tracking and no JavaScript. Everything is stored in the HTML files which look like this:

![screenshot](feed/screenshot.png)
![screenshot](screenshot.png)

## Usage

@@ -23,19 +23,27 @@ When `📰index.html` grows large (1000 items by default), the oldest 500 items

`📂news` can reside in Google Drive or Dropbox for easy access everywhere. This is how I use it:

![screenshot](feed/demo.gif)
![screenshot](demo.gif)

## Command-line arguments
`news -h` prints:
```
-d string directory to save html files in. "./news" is used by default and created if necessary
-i int minutes to wait between updates (default 10)
-n int number of items per .html file. A new page.html file is created whenever
index.html contains 2x that number (default 500)
-t int timeout in seconds when fetching feeds (default 10)
-c string optional custom Go html/template file to to use when generating .html files.
See `news/feed/template.go` in source for an example
-v verbose mode outputs extra info when enabled
-d, dir string
directory to store html files. By default ./news is used and created if necessary
-items int
number of items per page.html file. A new page.html file is created whenever index.html contains 2x that number (default 500)
-noflood int
minimum seconds between calls to the same domain to avoid flooding (default 30)
-opml string
path to OPML file containing feed URLs to be imported. Existing feed URLs are overwritten, not duplicated
-template news/feed/template.go
custom Go html/template file to use when generating .html files. See news/feed/template.go
-timeout int
timeout in seconds when fetching feeds (default 10)
-v, verbose
verbose mode outputs extra info when enabled
-wait int
minutes to wait between updates (default 10)
```
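
For illustration only (the file name `subscriptions.opml` and the flag values are made up, not defaults), a run that imports an OPML file into a custom directory and waits at least 60 seconds between requests to the same domain could look like:

```
news -dir "D:/gdrive/news" -opml subscriptions.opml -noflood 60
```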

## Running from code
@@ -57,4 +65,4 @@ Windows, Linux and OSX binaries are available in [Releases](https://github.com/w

## License

Dedicated to [Public Domain](https://gist.github.com/ww9/4c4481fb7b55186960a34266078c88b1). Do whatever you want with it, including changing the license and omitting the author.
[The Unlicense](http://unlicense.org/), [Public Domain](https://gist.github.com/ww9/4c4481fb7b55186960a34266078c88b1). As free as it gets.
2 changes: 1 addition & 1 deletion Taskfile.yml
@@ -45,4 +45,4 @@ tasks:
gdrive:
desc: Example usage with Google Drive on Windows with verbose output
cmds:
- go run main.go -d "D:/gdrive/news"
- go run main.go -dir "D:/gdrive/news"
File renamed without changes
121 changes: 88 additions & 33 deletions feed/feed.go
@@ -11,6 +11,7 @@ import (
"strings"
"time"

"github.com/gilliek/go-opml/opml"
"github.com/mmcdole/gofeed"
"github.com/sirupsen/logrus"

@@ -26,7 +27,7 @@ type Item struct {
Tag string
}

// Aggregator is the core structure that fetches feeds and saves them to html
// Aggregator is the core structure that fetches feeds and saves them to html. See Aggregator.Update()
type Aggregator struct {
Items []Item // Ordered from newest to oldest. Always prepend new items.
// Feeds is a map of URLs -> Titles for feeds. This needs to be stored somewhere so reader knows from where to fetch news
@@ -38,23 +39,19 @@ type Aggregator struct {
pages int
ItemsPerPage int
NextPage int
}

var log = logrus.New()

// SetLogger is used from main to set custom unified logger
func SetLogger(logger *logrus.Logger) {
log = logger
log *logrus.Logger
}

// New creates an Aggregator with default URL fetcher
func New(directory string) (*Aggregator, error) {
client := &http.Client{Timeout: 10 * time.Second}
return NewWithCustom(directory, 1000, MakeURLFetcher(client))
log := logrus.New()
return NewWithCustom(log, directory, 1000, MakeURLFetcher(log, 30*time.Second, client))
}

// NewWithCustom allows for creating customized Aggregators such as custom URL fetcher for testing or with custom http.client
func NewWithCustom(directory string, itemsPerPage int, URLFetcher func(URL string) ([]byte, error)) (*Aggregator, error) {
// minDomainRequestInterval is the minimum time we must wait between calls to same domain. Aka debouncer. For cases like multiple reddit.com feeds.
func NewWithCustom(log *logrus.Logger, directory string, itemsPerPage int, URLFetcher func(URL string) ([]byte, error)) (*Aggregator, error) {
if directory == "" {
directory = "news"
}
@@ -66,12 +63,13 @@ func NewWithCustom(directory string, itemsPerPage int, URLFetcher func(URL strin
URLFetcher: URLFetcher,
ItemsPerPage: itemsPerPage,
pages: 1,
log: log,
}

if !fileExists(agg.Directory) {
if agg.Directory == "news" {
if errDir := os.Mkdir(agg.Directory, os.ModeDir); errDir != nil {
return nil, fmt.Errorf("couldn't create dirextory: %s", errDir)
return nil, fmt.Errorf("couldn't create directory: %s", errDir)
}
} else {
return nil, fmt.Errorf("directory %s does not exist", agg.Directory)
@@ -85,11 +83,11 @@ func NewWithCustom(directory string, itemsPerPage int, URLFetcher func(URL strin
log.Infof("Created %s with sample feeds.\n", indexFile)
}

return agg, agg.loadKnownURLs()
return agg, agg.loadFeedsAndItemsFromHTMLFiles()
}

// feedXMLParser returns items ordered from oldest to newest. So we can always just append as long as template reads in inverted order.
func feedXMLParser(XML []byte) (items []Item, err error) {
// parseXML returns items ordered from oldest to newest. So we can always just append as long as template reads in inverted order.
func (agg *Aggregator) parseXML(XML []byte) (items []Item, err error) {
cleanXML := cleanXML(XML)
items = make([]Item, 0)
parser := gofeed.NewParser()
@@ -109,7 +107,7 @@ func feedXMLParser(XML []byte) (items []Item, err error) {
itemURL = strings.TrimSpace(item.Custom["Comments"])
}
if itemURL == "" {
log.Debugf("skipping item from feed %s due to lack of URL", feed.Link)
agg.log.Debugf("skipping item from feed %s due to lack of URL", feed.Link)
continue
}
itemTitle := strings.TrimSpace(item.Title)
@@ -118,7 +116,7 @@ func feedXMLParser(XML []byte) (items []Item, err error) {
if itemTitle == "" {
itemTitle = itemURL
}
log.Debugf("using %s to fill in feed %s item empty description", itemTitle, feed.Link)
agg.log.Debugf("using %s to fill in feed %s item empty description", itemTitle, feed.Link)
}
items = append([]Item{{
Title: itemTitle,
@@ -131,12 +129,12 @@ func feedXMLParser(XML []byte) (items []Item, err error) {
// MakeURLFetcher returns the default URL fetcher, which uses the given http.Client to fetch feed XML.
// The other one is fakeURLFetcher() used for testing.
// There's also a retired makeCachedURLFetcher() which was used during initial phases of development and is kept in misc.go
func MakeURLFetcher(client *http.Client) func(URL string) (content []byte, err error) {
antiFlood := makeURLDebouncer(30 * time.Second)
func MakeURLFetcher(log *logrus.Logger, minDomainRequestInterval time.Duration, client *http.Client) func(URL string) (content []byte, err error) {
antiFlood := makeURLDebouncer(log, minDomainRequestInterval)
return func(URL string) (content []byte, err error) {
req, err := http.NewRequest("GET", antiFlood(URL), nil)
if err != nil {
log.Fatalln(err)
return nil, fmt.Errorf("could not create GET request to URL %s : %s", URL, err)
}
req.Header.Set("User-Agent", uarand.GetRandom())
req.Header.Set("Accept", "application/xml")
@@ -230,7 +228,7 @@ func (item *Item) SetTag() {
}
}

func (agg *Aggregator) loadKnownURLs() error {
func (agg *Aggregator) loadFeedsAndItemsFromHTMLFiles() error {
for i := 1; ; i++ {
filePath := filepath.Clean(agg.Directory + "/index.html")
if i > 1 {
@@ -243,7 +241,7 @@ func (agg *Aggregator) loadKnownURLs() error {
break
}
agg.pages = i
log.Debugf("reading items from %s", filePath)
agg.log.Debugf("reading items from %s", filePath)
items, feeds, err := loadFromFile(filePath)
if err != nil {
return fmt.Errorf("could not load known URLs from file %s : %s", filePath, err)
@@ -258,14 +256,70 @@ func (agg *Aggregator) loadKnownURLs() error {
return nil
}

func createSampleIndex(file string) error {
return savePageToFile(file, []Item{}, map[string]string{
func getSampleFeeds() map[string]string {
return map[string]string{
"https://www.reddit.com/r/golang/.rss": "/r/golang",
"https://news.ycombinator.com/rss": "Hacker News",
}, 0)
}
}

func createSampleIndex(file string) error {
return savePageToFile(file, []Item{}, getSampleFeeds(), 0)
}

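// ImportOPMLFile reads feed URLs from the OPML file at filePath, merges them into agg.Feeds
// (overwriting entries with the same URL) and saves them to index.html. It returns the number of feeds found.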
func (agg *Aggregator) ImportOPMLFile(filePath string) (importedFeeds int, err error) {
doc, err := opml.NewOPMLFromFile(filePath)
if err != nil {
return 0, err
}
feeds := make(map[string]string)
collectFeedsFromOPMLOutline(feeds, doc.Outlines())
if len(feeds) < 1 {
return 0, fmt.Errorf("no feed URLs found")
}
for URL, title := range feeds {
agg.Feeds[URL] = title
}
// Save feeds to index.html
indexFile := filepath.Clean(agg.Directory + "/index.html")
indexItems, _, err := loadFromFile(indexFile)
if err != nil {
return 0, fmt.Errorf("could not save imported feeds to %s: %s", indexFile, err)
}
if err := savePageToFile(indexFile, indexItems, agg.Feeds, agg.pages); err != nil {
return 0, fmt.Errorf("could not save imported feeds to %s: %s", indexFile, err)
}

return len(feeds), nil
}

// Apparently outlines can be recursive, so we must be able to dig deep
// Example 1: <outline text="24 ways" htmlUrl="http://24ways.org/" type="rss" xmlUrl="http://feeds.feedburner.com/24ways"/>
// Example 2:
// <outline title="News" text="News">
// <outline text="Big News Finland" title="Big News Finland" type="rss" xmlUrl="http://www.bignewsnetwork.com/?rss=37e8860164ce009a"/>
// <outline text="Euronews" title="Euronews" type="rss" xmlUrl="http://feeds.feedburner.com/euronews/en/news/"/>
// <outline text="Reuters Top News" title="Reuters Top News" type="rss" xmlUrl="http://feeds.reuters.com/reuters/topNews"/>
// <outline text="Yahoo Europe" title="Yahoo Europe" type="rss" xmlUrl="http://rss.news.yahoo.com/rss/europe"/>
// </outline>
func collectFeedsFromOPMLOutline(feeds map[string]string, outlines []opml.Outline) {
for _, outline := range outlines {

if outline.XMLURL != "" {
feeds[outline.XMLURL] = strings.TrimSpace(outline.Text)
// If feed title is empty, use URL instead
if feeds[outline.XMLURL] == "" {
feeds[outline.XMLURL] = outline.XMLURL
}
}
if len(outline.Outlines) > 0 {
collectFeedsFromOPMLOutline(feeds, outline.Outlines)
}
}
}
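
ImportOPMLFile is meant to be called from outside the package; main.go is not among the files shown here, but a minimal sketch of how the new -opml flag might be wired to it could look like the following (the flag handling and the import path github.com/ww9/news/feed are assumptions, not taken from this diff):

```go
package main

import (
	"flag"
	"log"

	"github.com/ww9/news/feed" // assumed import path for the feed package
)

func main() {
	dir := flag.String("dir", "news", "directory to store html files")
	opmlPath := flag.String("opml", "", "path to OPML file with feed URLs to import")
	flag.Parse()

	agg, err := feed.New(*dir)
	if err != nil {
		log.Fatal(err)
	}
	// Import feeds before the first update so they are fetched right away.
	if *opmlPath != "" {
		n, err := agg.ImportOPMLFile(*opmlPath)
		if err != nil {
			log.Fatalf("OPML import failed: %s", err)
		}
		log.Printf("imported %d feeds from %s", n, *opmlPath)
	}
}
```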

// Update load feeds from index.html, fetches items from them and save everything back to index.html. Also generates pageX.html if necessary.
// Update reads feed URLs from index.html, fetches the RSS/Atom feed from each URL found and saves everything back to index.html.
// Also generates new pageX.html files when index.html is too large.
func (agg *Aggregator) Update() (err error) {
indexFile := agg.Directory + "/index.html"
indexItems, feeds, err := loadFromFile(indexFile)
@@ -277,17 +331,18 @@ func (agg *Aggregator) Update() (err error) {
}
agg.Items = indexItems
agg.Feeds = feeds
// Access feeds in random order
shuffledURLs := shuffleMapKeys(agg.Feeds)
for _, feedURL := range shuffledURLs {
log.Debugf("reading items from %s", feedURL)
agg.log.Debugf("reading items from %s", feedURL)
contents, err := agg.URLFetcher(feedURL)
if err != nil {
log.Errorf("%s : %s", feedURL, err)
agg.log.Errorf("%s : %s", feedURL, err)
continue
}
items, err := feedXMLParser(contents)
items, err := agg.parseXML(contents)
if err != nil {
log.Errorf("%s: %s", feedURL, err)
agg.log.Errorf("%s: %s", feedURL, err)
continue
}
for i := len(items) - 1; i >= 0; i-- {
@@ -302,22 +357,22 @@ func (agg *Aggregator) Update() (err error) {
for len(agg.Items) >= agg.ItemsPerPage*2 {
pageItems := agg.Items[agg.ItemsPerPage:]
agg.pages++
log.Debugf("saving items to page%d.html", agg.pages)
agg.log.Debugf("saving items to page%d.html", agg.pages)
pageFile := fmt.Sprintf(agg.Directory+"/page%d.html", agg.pages)
if err := savePageToFile(pageFile, pageItems, agg.Feeds, agg.pages-1); err != nil {
log.Errorf("error saving page %s : %s", pageFile, err)
agg.log.Errorf("error saving page %s : %s", pageFile, err)
continue
}
agg.Items = agg.Items[:agg.ItemsPerPage]
}
// User might have updated feeds in index.html, so we must read it again to prevent overwriting
_, feedsToSave, err := loadFromFile(indexFile)
if err != nil {
log.Errorf("error reading feeds before writing to %s: %s", indexFile, err)
agg.log.Errorf("error reading feeds before writing to %s: %s", indexFile, err)
feedsToSave = agg.Feeds
}
if err := savePageToFile(indexFile, agg.Items, feedsToSave, agg.pages); err != nil {
log.Errorf("error saving page %s : %s", indexFile, err)
agg.log.Errorf("error saving page %s : %s", indexFile, err)
continue
}
}
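
The rest of Update() and the main loop that drives it are not shown in this diff. As a sketch only, a caller could run Update() on the -wait interval and exit on CTRL+C roughly like this (function and variable names, and the import path, are illustrative):

```go
package main

import (
	"log"
	"os"
	"os/signal"
	"time"

	"github.com/ww9/news/feed" // assumed import path for the feed package
)

// runLoop calls agg.Update() once per interval until CTRL+C is received.
func runLoop(agg *feed.Aggregator, interval time.Duration) {
	quit := make(chan os.Signal, 1)
	signal.Notify(quit, os.Interrupt) // CTRL+C

	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	for {
		if err := agg.Update(); err != nil {
			log.Printf("update failed: %s", err)
		}
		select {
		case <-ticker.C: // e.g. the -wait value in minutes
		case <-quit:
			return
		}
	}
}
```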
