diff --git a/augur/housekeeper.py b/augur/housekeeper.py index 7eba9000ee..b917da9812 100644 --- a/augur/housekeeper.py +++ b/augur/housekeeper.py @@ -179,21 +179,14 @@ def prep_jobs(self): ) if 'repo_group_id' in job and job['repo_group_id'] != 0 else '{} repo.repo_id IN ({})'.format( where_and, ",".join(str(id) for id in job['repo_ids'])) if 'repo_ids' in job else '' repo_url_sql = s.sql.text(""" - SELECT repo.repo_id, repo.repo_git, pull_request_count, collected_pr_count, - (repo_info.pull_request_count - pr_count.collected_pr_count) AS pull_requests_missing - FROM augur_data.repo LEFT OUTER JOIN ( - SELECT count(*) AS collected_pr_count, repo_id - FROM pull_requests GROUP BY repo_id ) pr_count - ON pr_count.repo_id = repo.repo_id LEFT OUTER JOIN ( - SELECT repo_id, MAX ( data_collection_date ) AS last_collected - FROM augur_data.repo_info - GROUP BY repo_id) recent_info - ON recent_info.repo_id = pr_count.repo_id LEFT OUTER JOIN repo_info - ON recent_info.repo_id = repo_info.repo_id - AND repo_info.data_collection_date = recent_info.last_collected + SELECT yy.repo_id,repo_git,pull_request_count,collected_pr_count,pull_requests_missing FROM ( + SELECT repo_info.repo_id,repo.repo_git,MAX (pull_request_count) AS pull_request_count FROM repo_info,repo-- WHERE issues_enabled = 'true' + WHERE pull_request_count>=1 AND repo.repo_id=repo_info.repo_id GROUP BY repo_info.repo_id,repo.repo_git ORDER BY repo_info.repo_id,repo.repo_git) yy LEFT OUTER JOIN ( + SELECT A.repo_id,COUNT (*) AS collected_pr_count,(b.pull_request_count-COUNT (*)) AS pull_requests_missing FROM augur_data.repo A,augur_data.pull_requests d,augur_data.repo_info b,( + SELECT repo_id,MAX (data_collection_date) AS last_collected FROM augur_data.repo_info GROUP BY repo_id ORDER BY repo_id) e,( + SELECT repo_id,MAX (data_collection_date) AS last_pr_collected FROM augur_data.pull_requests GROUP BY repo_id ORDER BY repo_id) f WHERE A.repo_id=b.repo_id AND LOWER (A.repo_git) LIKE '%github.com%' AND A.repo_id=d.repo_id AND b.repo_id=d.repo_id AND e.repo_id=A.repo_id AND b.data_collection_date=e.last_collected AND f.repo_id=A.repo_id {} - GROUP BY repo.repo_id, repo_info.pull_request_count, pr_count.collected_pr_count - ORDER BY pull_requests_missing DESC NULLS LAST + GROUP BY A.repo_id,d.repo_id,b.pull_request_count,e.last_collected,f.last_pr_collected ORDER BY A.repo_id DESC) zz ON yy.repo_id=zz.repo_id ORDER BY pull_requests_missing DESC NULLS FIRST """.format(where_condition)) if job['model'] == 'pull_requests' else s.sql.text(""" SELECT * diff --git a/metadata.py b/metadata.py index fccf14a165..222670f4da 100644 --- a/metadata.py +++ b/metadata.py @@ -5,8 +5,8 @@ __short_description__ = "Python 3 package for free/libre and open-source software community metrics, models & data collection" -__version__ = "0.23.7.2" -__release__ = "v0.23.7.2" +__version__ = "0.23.8" +__release__ = "v0.23.8" __license__ = "MIT" __copyright__ = "University of Missouri, University of Nebraska-Omaha, CHAOSS & Augurlabs 2022" diff --git a/setup.py b/setup.py index 666d9b2f9b..d70d3ef667 100644 --- a/setup.py +++ b/setup.py @@ -42,11 +42,11 @@ "Flask==2.0.2", "Flask-Cors==3.0.10", "Flask-Login==0.5.0", - "Flask-WTF==0.15.1", - "pandas==1.3.2", + "Flask-WTF==1.0.0", + "pandas==1.3.5", "numpy==1.21", - "requests==2.22.0", - "psycopg2-binary==2.8.6", + "requests==2.27.1", + "psycopg2-binary==2.9.3", "click==8.0.3", "psutil==5.8.0", "gunicorn==20.1.0", @@ -60,7 +60,7 @@ "partd >= 0.3.10", "distributed >= 2021.03.0", "nltk==3.6.6", - "h5py~=3.1.0", + "h5py~=3.6.0", "scipy==1.7.3", "blinker==1.4", "protobuf > 3.6.0", diff --git a/workers/clustering_worker/setup.py b/workers/clustering_worker/setup.py index d25105085f..2eb1979513 100644 --- a/workers/clustering_worker/setup.py +++ b/workers/clustering_worker/setup.py @@ -24,15 +24,15 @@ def read(filename): 'Flask==2.0.2', 'Flask-Cors==3.0.10', 'Flask-Login==0.5.0', - 'Flask-WTF==0.15.1', - 'requests==2.22.0', - 'psycopg2-binary==2.8.6', + 'Flask-WTF==1.0.0', + 'requests==2.27.1', + 'psycopg2-binary==2.9.3', 'sklearn==0.0', 'numpy==1.21.0', 'nltk==3.6.6', 'seaborn==0.11.1', - 'pandas==1.3.2', - 'matplotlib==3.3.4' + 'pandas==1.3.5', + 'matplotlib==3.5.1' ], entry_points={ 'console_scripts': [ diff --git a/workers/contributor_breadth_worker/setup.py b/workers/contributor_breadth_worker/setup.py index c2c1555102..15aff35abf 100644 --- a/workers/contributor_breadth_worker/setup.py +++ b/workers/contributor_breadth_worker/setup.py @@ -25,9 +25,9 @@ def read(filename): 'Flask==2.0.2', 'Flask-Cors==3.0.10', 'Flask-Login==0.5.0', - 'Flask-WTF==0.15.1', - 'requests==2.22.0', - 'psycopg2-binary==2.8.6' + 'Flask-WTF==1.0.0', + 'requests==2.27.1', + 'psycopg2-binary==2.9.3' ], entry_points={ 'console_scripts': [ diff --git a/workers/contributor_worker/setup.py b/workers/contributor_worker/setup.py index 9552c4ba1b..91205699c1 100644 --- a/workers/contributor_worker/setup.py +++ b/workers/contributor_worker/setup.py @@ -25,9 +25,9 @@ def read(filename): 'Flask==2.0.2', 'Flask-Cors==3.0.10', 'Flask-Login==0.5.0', - 'Flask-WTF==0.15.1', - 'requests==2.22.0', - 'psycopg2-binary==2.8.6', + 'Flask-WTF==1.0.0', + 'requests==2.27.1', + 'psycopg2-binary==2.9.3', 'click==8.0.3', 'scipy==1.7.3', 'sklearn==0.0' diff --git a/workers/deps_libyear_worker/setup.py b/workers/deps_libyear_worker/setup.py index 43b116b5cf..5a6a7a411f 100644 --- a/workers/deps_libyear_worker/setup.py +++ b/workers/deps_libyear_worker/setup.py @@ -25,9 +25,9 @@ def read(filename): 'Flask==2.0.2', 'Flask-Cors==3.0.10', 'Flask-Login==0.5.0', - 'Flask-WTF==0.15.1', - 'requests==2.22.0', - 'psycopg2-binary==2.8.6', + 'Flask-WTF==1.0.0', + 'requests==2.27.1', + 'psycopg2-binary==2.9.3', 'toml', 'pyYaml' ], diff --git a/workers/deps_worker/setup.py b/workers/deps_worker/setup.py index 2c1ee4b881..00965f380f 100644 --- a/workers/deps_worker/setup.py +++ b/workers/deps_worker/setup.py @@ -25,9 +25,9 @@ def read(filename): 'Flask==2.0.2', 'Flask-Cors==3.0.10', 'Flask-Login==0.5.0', - 'Flask-WTF==0.15.1', - 'requests==2.22.0', - 'psycopg2-binary==2.8.6' + 'Flask-WTF==1.0.0', + 'requests==2.27.1', + 'psycopg2-binary==2.9.3' ], entry_points={ 'console_scripts': [ diff --git a/workers/discourse_analysis_worker/setup.py b/workers/discourse_analysis_worker/setup.py index 5882636cf5..f50f785edc 100644 --- a/workers/discourse_analysis_worker/setup.py +++ b/workers/discourse_analysis_worker/setup.py @@ -24,16 +24,15 @@ def read(filename): 'Flask==2.0.2', 'Flask-Cors==3.0.10', 'Flask-Login==0.5.0', - 'Flask-WTF==0.15.1', - 'requests==2.22.0', - 'psycopg2-binary==2.8.6', + 'Flask-WTF==1.0.0', + 'requests==2.27.1', + 'psycopg2-binary==2.9.3', 'click==8.0.3', 'scipy==1.7.3', 'nltk==3.6.6', - 'pandas==1.3.2', - 'scikit-learn==0.24.1', - 'textblob==0.15.3', - 'sklearn-crfsuite==0.3.6' + 'pandas==1.3.5', + 'scikit-learn==1.0.2', + 'textblob==0.15.3' ], entry_points={ 'console_scripts': [ diff --git a/workers/facade_worker/setup.py b/workers/facade_worker/setup.py index 84da25c2cf..ac917fc09e 100644 --- a/workers/facade_worker/setup.py +++ b/workers/facade_worker/setup.py @@ -25,9 +25,9 @@ def read(filename): 'Flask==2.0.2', 'Flask-Cors==3.0.10', 'Flask-Login==0.5.0', - 'Flask-WTF==0.15.1', - 'requests==2.22.0', - 'psycopg2-binary==2.8.6', + 'Flask-WTF==1.0.0', + 'requests==2.27.1', + 'psycopg2-binary==2.9.3', 'click==8.0.3', 'XlsxWriter==1.3.7' ], diff --git a/workers/github_worker/setup.py b/workers/github_worker/setup.py index 622f7f28fd..577301f936 100644 --- a/workers/github_worker/setup.py +++ b/workers/github_worker/setup.py @@ -25,9 +25,9 @@ def read(filename): 'Flask==2.0.2', 'Flask-Cors==3.0.10', 'Flask-Login==0.5.0', - 'Flask-WTF==0.15.1', - 'requests==2.22.0', - 'psycopg2-binary==2.8.6', + 'Flask-WTF==1.0.0', + 'requests==2.27.1', + 'psycopg2-binary==2.9.3', 'click==8.0.3' ], entry_points={ diff --git a/workers/gitlab_issues_worker/setup.py b/workers/gitlab_issues_worker/setup.py index 5f58f9bb46..5cd9000257 100644 --- a/workers/gitlab_issues_worker/setup.py +++ b/workers/gitlab_issues_worker/setup.py @@ -25,9 +25,9 @@ def read(filename): 'Flask==2.0.2', 'Flask-Cors==3.0.10', 'Flask-Login==0.5.0', - 'Flask-WTF==0.15.1', - 'requests==2.22.0', - 'psycopg2-binary==2.8.6', + 'Flask-WTF==1.0.0', + 'requests==2.27.1', + 'psycopg2-binary==2.9.3', 'click==8.0.3' ], entry_points={ diff --git a/workers/gitlab_merge_request_worker/setup.py b/workers/gitlab_merge_request_worker/setup.py index c67aa49990..e1cb2d64df 100644 --- a/workers/gitlab_merge_request_worker/setup.py +++ b/workers/gitlab_merge_request_worker/setup.py @@ -24,9 +24,9 @@ def read(filename): 'Flask==2.0.2', 'Flask-Cors==3.0.10', 'Flask-Login==0.5.0', - 'Flask-WTF==0.15.1', - 'requests==2.22.0', - 'psycopg2-binary==2.8.6', + 'Flask-WTF==1.0.0', + 'requests==2.27.1', + 'psycopg2-binary==2.9.3', 'click==8.0.3' ], entry_points={ diff --git a/workers/insight_worker/setup.py b/workers/insight_worker/setup.py index 5b7853e811..155ea761e5 100644 --- a/workers/insight_worker/setup.py +++ b/workers/insight_worker/setup.py @@ -25,9 +25,9 @@ def read(filename): 'Flask==2.0.2', 'Flask-Cors==3.0.10', 'Flask-Login==0.5.0', - 'Flask-WTF==0.15.1', - 'requests==2.22.0', - 'psycopg2-binary==2.8.6', + 'Flask-WTF==1.0.0', + 'requests==2.27.1', + 'psycopg2-binary==2.9.3', 'click==8.0.3', 'scipy>=1.7.3', 'sklearn==0.0', diff --git a/workers/linux_badge_worker/setup.py b/workers/linux_badge_worker/setup.py index 3813e6ee3c..f3f599cb29 100644 --- a/workers/linux_badge_worker/setup.py +++ b/workers/linux_badge_worker/setup.py @@ -25,9 +25,9 @@ def read(filename): 'Flask==2.0.2', 'Flask-Cors==3.0.10', 'Flask-Login==0.5.0', - 'Flask-WTF==0.15.1', - 'requests==2.22.0', - 'psycopg2-binary==2.8.6', + 'Flask-WTF==1.0.0', + 'requests==2.27.1', + 'psycopg2-binary==2.9.3', 'click==8.0.3' ], entry_points={ diff --git a/workers/message_insights_worker/message_insights_worker.py b/workers/message_insights_worker/message_insights_worker.py index cd79f39015..0c248bdebf 100644 --- a/workers/message_insights_worker/message_insights_worker.py +++ b/workers/message_insights_worker/message_insights_worker.py @@ -50,7 +50,7 @@ def __init__(self, config={}): # Define data collection info self.tool_source = 'Message Insights Worker' - self.tool_version = '0.0.2' + self.tool_version = '0.2.0' self.data_source = 'Non-existent API' self.insight_days = self.config['insight_days'] diff --git a/workers/message_insights_worker/setup.py b/workers/message_insights_worker/setup.py index 25a97653b8..9d159833f8 100644 --- a/workers/message_insights_worker/setup.py +++ b/workers/message_insights_worker/setup.py @@ -13,7 +13,7 @@ def read(filename): setup( name="message_insights_worker", - version="0.1.2", + version="0.2.0", url="https://github.com/chaoss/augur", license='MIT', author="Augur Team", @@ -24,22 +24,22 @@ def read(filename): 'Flask==2.0.2', 'Flask-Cors==3.0.10', 'Flask-Login==0.5.0', - 'Flask-WTF==0.15.1', - 'requests==2.22.0', - 'psycopg2-binary==2.8.6', + 'Flask-WTF==1.0.0', + 'requests==2.27.1', + 'psycopg2-binary==2.9.3', 'click==8.0.3', 'scipy==1.7.3', 'sklearn==0.0', 'numpy==1.21.0', 'nltk==3.6.6', - 'pandas==1.3.2', - 'gensim==3.8.3', + 'pandas==1.3.5', + 'gensim==4.1.2', 'emoji==1.2.0', - 'Keras<2.8.0', + 'Keras>=2.8.0rc0', 'Keras-Preprocessing==1.1.2', - 'tensorflow==2.7.0', - 'h5py~=3.1.0', - 'scikit-image==0.18.1', + 'tensorflow==2.8.0rc0', + 'h5py~=3.6.0', + 'scikit-image==0.19.1', 'joblib==1.0.1', 'xgboost', 'bs4==0.0.1', diff --git a/workers/pull_request_analysis_worker/setup.py b/workers/pull_request_analysis_worker/setup.py index 2cfaf59a2d..8e6cda27df 100644 --- a/workers/pull_request_analysis_worker/setup.py +++ b/workers/pull_request_analysis_worker/setup.py @@ -24,14 +24,14 @@ def read(filename): 'Flask==2.0.2', 'Flask-Cors==3.0.10', 'Flask-Login==0.5.0', - 'Flask-WTF==0.15.1', - 'requests==2.22.0', - 'psycopg2-binary==2.8.6', + 'Flask-WTF==1.0.0', + 'requests==2.27.1', + 'psycopg2-binary==2.9.3', 'sklearn==0.0', 'nltk==3.6.6', 'numpy==1.21.0', - 'pandas==1.3.2', - 'gensim==3.8.3', + 'pandas==1.3.5', + 'gensim==4.1.2', 'emoji==1.2.0', 'joblib==1.0.1', 'xgboost==1.4.2', diff --git a/workers/pull_request_worker/pull_request_worker.py b/workers/pull_request_worker/pull_request_worker.py index a7c70e3e71..8a1dbf4929 100644 --- a/workers/pull_request_worker/pull_request_worker.py +++ b/workers/pull_request_worker/pull_request_worker.py @@ -49,7 +49,7 @@ def __init__(self, config={}): # Define data collection info self.tool_source = 'GitHub Pull Request Worker' - self.tool_version = '1.0.0' + self.tool_version = '1.2.0' self.data_source = 'GitHub API' #Needs to be an attribute of the class for incremental database insert using paginate_endpoint @@ -383,7 +383,7 @@ def _get_pk_source_prs(self): #self.owner and self.repo are both defined in the worker base's collect method using the url of the github repo. pr_url = ( f"https://api.github.com/repos/{self.owner}/{self.repo}/pulls?state=all&" - "direction=asc&per_page=100&page={}" + "direction=desc&per_page=100&page={}" ) #Database action map is essential in order to avoid duplicates messing up the data @@ -437,7 +437,7 @@ def pk_source_increment_insert(inc_source_prs, action_map): { 'repo_id': self.repo_id, 'pr_url': pr['url'], - 'pr_src_id': pr['id'], + 'pr_src_id': int(str(pr['id']).encode(encoding='UTF-8').decode(encoding='UTF-8')),#1-22-2022 inconsistent casting; sometimes int, sometimes float in bulk_insert 'pr_src_node_id': pr['node_id'], ## 9/20/2021 - This was null. No idea why. 'pr_html_url': pr['html_url'], 'pr_diff_url': pr['diff_url'], @@ -459,10 +459,10 @@ def pk_source_increment_insert(inc_source_prs, action_map): ) else None, 'pr_created_at': pr['created_at'], 'pr_updated_at': pr['updated_at'], - 'pr_closed_at': s.sql.expression.null() if not ( # This had to be changed because "None" is JSON. SQL requires NULL SPG 11/28/2021 + 'pr_closed_at': None if not ( pr['closed_at'] ) else pr['closed_at'], - 'pr_merged_at': None if not ( # This had to be changed because "None" is JSON. SQL requires NULL + 'pr_merged_at': None if not ( pr['merged_at'] ) else pr['merged_at'], 'pr_merge_commit_sha': pr['merge_commit_sha'], @@ -523,7 +523,8 @@ def bulk_insert( self.bulk_insert( self.pull_requests_table, update=inc_source_prs['update'], unique_columns=action_map['insert']['augur'], - insert=prs_insert, update_columns=['pr_src_state', 'pr_closed_at', 'pr_updated_at', 'pr_merged_at'] + insert=prs_insert, update_columns=['pr_src_state', 'pr_closed_at', 'pr_updated_at', 'pr_merged_at'], + convert_float_int=True ) source_data = inc_source_prs['insert'] + inc_source_prs['update'] @@ -1133,4 +1134,4 @@ def query_pr_repo(self, pr_repo, pr_repo_type, pr_meta_id): self.logger.debug(f"repo exception registerred for PRs: {e}") self.logger.debug(f"Nested Model error at loop {pr_nested_loop} : {e}.") stacker = traceback.format_exc() - self.logger.debug(f"{stacker}") \ No newline at end of file + self.logger.debug(f"{stacker}") diff --git a/workers/pull_request_worker/setup.py b/workers/pull_request_worker/setup.py index 355bdeed72..19a8bfd54a 100644 --- a/workers/pull_request_worker/setup.py +++ b/workers/pull_request_worker/setup.py @@ -14,7 +14,7 @@ def read(filename): setup( name="pull_request_worker", - version="1.1.0", + version="1.2.0", url="https://github.com/chaoss/augur", license='MIT', author="Augurlabs", @@ -25,9 +25,9 @@ def read(filename): 'Flask==2.0.2', 'Flask-Cors==3.0.10', 'Flask-Login==0.5.0', - 'Flask-WTF==0.15.1', - 'requests==2.22.0', - 'psycopg2-binary==2.8.6', + 'Flask-WTF==1.0.0', + 'requests==2.27.1', + 'psycopg2-binary==2.9.3', 'click==8.0.3' ], entry_points={ diff --git a/workers/release_worker/setup.py b/workers/release_worker/setup.py index fa0ff6694d..948bcca526 100644 --- a/workers/release_worker/setup.py +++ b/workers/release_worker/setup.py @@ -25,9 +25,9 @@ def read(filename): 'Flask==2.0.2', 'Flask-Cors==3.0.10', 'Flask-Login==0.5.0', - 'Flask-WTF==0.15.1', - 'requests==2.22.0', - 'psycopg2-binary==2.8.6', + 'Flask-WTF==1.0.0', + 'requests==2.27.1', + 'psycopg2-binary==2.9.3', 'click==8.0.3' ], entry_points={ diff --git a/workers/repo_info_worker/setup.py b/workers/repo_info_worker/setup.py index 7b39f08a4a..14f67fce04 100644 --- a/workers/repo_info_worker/setup.py +++ b/workers/repo_info_worker/setup.py @@ -25,9 +25,9 @@ def read(filename): 'Flask==2.0.2', 'Flask-Cors==3.0.10', 'Flask-Login==0.5.0', - 'Flask-WTF==0.15.1', - 'requests==2.22.0', - 'psycopg2-binary==2.8.6' + 'Flask-WTF==1.0.0', + 'requests==2.27.1', + 'psycopg2-binary==2.9.3' ], entry_points={ 'console_scripts': [ diff --git a/workers/value_worker/setup.py b/workers/value_worker/setup.py index ec6530b625..44d84d7573 100644 --- a/workers/value_worker/setup.py +++ b/workers/value_worker/setup.py @@ -19,9 +19,9 @@ 'Flask==2.0.2', 'Flask-Cors==3.0.10', 'Flask-Login==0.5.0', - 'Flask-WTF==0.15.1', - 'requests==2.22.0', - 'psycopg2-binary==2.8.6', + 'Flask-WTF==1.0.0', + 'requests==2.27.1', + 'psycopg2-binary==2.9.3', 'click==8.0.3' ], entry_points={ diff --git a/workers/worker_git_integration.py b/workers/worker_git_integration.py index ed2851ea0c..8f6b459b47 100644 --- a/workers/worker_git_integration.py +++ b/workers/worker_git_integration.py @@ -1391,7 +1391,7 @@ def load_url(url, extra_data={}): #insertion_method and stagger are arguments that allow paginate_endpoint to insert at around ~500 pages at a time. def paginate_endpoint( - self, url, action_map={}, table=None, where_clause=True, platform='github', in_memory=True, stagger=False, insertion_method=None, insertion_threshold=500 + self, url, action_map={}, table=None, where_clause=True, platform='github', in_memory=True, stagger=False, insertion_method=None, insertion_threshold=1000 ): #Get augur columns using the action map along with the primary key diff --git a/workers/worker_persistance.py b/workers/worker_persistance.py index f4a9700d62..5f42d20310 100644 --- a/workers/worker_persistance.py +++ b/workers/worker_persistance.py @@ -811,10 +811,13 @@ def psql_insert_copy(table, conn, keys, data_iter): else: table_name = table.name - sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format( - table_name, columns) + sql = 'COPY {} ({}) FROM STDIN WITH (FORMAT CSV, encoding "UTF-8")'.format( + table_name, columns) + + #(FORMAT CSV, FORCE_NULL(column_name)) self.logger.debug(f'table name is: {table_name}, and columns are {columns}.') + self.logger.debug(f'sql is: {sql}') #This causes the github worker to throw an error with pandas #cur.copy_expert(sql=sql, file=self.text_clean(s_buf))