WittmannF · WittmannF · Sep 10, 2022 · Sep 9, 2022
diff --git a/sortgs.py b/sortgs.py
@@ -202,6 +202,8 @@ def main():
     citations = []
     year = []
     author = []
+    venue = []
+    publisher = []
     rank = [0]
 
     # Get content from number_of_results URLs
@@ -225,10 +227,10 @@ def main():
                 print(e)
 
         # Create parser
-        soup = BeautifulSoup(c, 'html.parser')
+        soup = BeautifulSoup(c, 'html.parser', from_encoding='utf-8')
 
         # Get stuff
-        mydivs = soup.findAll("div", { "class" : "gs_r" })
+        mydivs = soup.findAll("div", { "class" : "gs_or" })
 
         for div in mydivs:
             try:
@@ -258,14 +260,24 @@ def main():
             except:
                 author.append("Author not found")
 
+            try:  # publisher: text after the last "-" in the result's gs_a div
+                publisher.append(div.find('div',{'class' : 'gs_a'}).text.split("-")[-1])
+            except (AttributeError, IndexError):  # find() returned None (no gs_a div)
+                publisher.append("Publisher not found")
+
+            try:  # venue: second-to-last "-" field, minus its final comma-separated item
+                venue.append(" ".join(div.find('div',{'class' : 'gs_a'}).text.split("-")[-2].split(",")[:-1]))
+            except (AttributeError, IndexError):  # no gs_a div, or no "-" separator for [-2]
+                venue.append("Venue not found")
+
             rank.append(rank[-1]+1)
 
         # Delay 
         sleep(0.5)
 
     # Create a dataset and sort by the number of citations
-    data = pd.DataFrame(list(zip(author, title, citations, year, links)), index = rank[1:],
-                        columns=['Author', 'Title', 'Citations', 'Year', 'Source'])
+    data = pd.DataFrame(list(zip(author, title, citations, year, publisher, venue, links)), index = rank[1:],
+                        columns=['Author', 'Title', 'Citations', 'Year', 'Publisher', 'Venue', 'Source'])
     data.index.name = 'Rank'
 
     # Add columns with number of citations per year

diff --git a/test/test_sortgs.py b/test/test_sortgs.py
@@ -18,10 +18,10 @@ def setUpClass(self):
         self.df_top_sorted_cit_per_year=pd.read_csv('machine_learning.csv')
 
     def test_get_10_results(self):
-        self.assertEqual(len(self.df_top_10), 12) # Two extra unwanted elements were captured
+        self.assertEqual(len(self.df_top_10), 10)
 
     def test_get_20_results(self):
-        self.assertEqual(len(self.df_top_20), 23) # Three extra unwanted elements were captured
+        self.assertEqual(len(self.df_top_20), 20)
 
     def test_is_sorted(self):
         df=self.df_top_20
@@ -34,15 +34,15 @@ def test_top_result(self):
         top_citation = int(df.Citations.values[0])
         top_cit_per_year = int(df['cit/year'].values[0])
         top_results = [top_author, top_citation, top_cit_per_year]
-        self.assertEqual(top_results, [' Bishop', 49230, 3077])
+        self.assertEqual(top_results, [' Bishop', 49230, 2896])
 
     def test_cit_per_year_sorted(self):
         df=self.df_top_sorted_cit_per_year
         top_citations=list(df.Citations.values[:5])
         top_cit_per_year = list(df['cit/year'].values[:5])
         top_results = [top_citations, top_cit_per_year]
         self.assertEqual(top_results, [[49230, 8603, 2853, 3166, 2416],
-                                        [3077,  860,  713,  396,  345]])
+                                        [2896, 782, 571, 352, 302]])
 
     def test_csv_exists(self):
         os.system("python sortgs.py --debug --kw 'machine learning' --nresults 10")