Add buglocator dataset scripts
mfejzer committed Jul 28, 2019
1 parent 62453c6 commit f3a1887
Showing 8 changed files with 1,373 additions and 0 deletions.
136 changes: 136 additions & 0 deletions calculate_buglocator_feature_3.py
@@ -0,0 +1,136 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Usage: %(scriptName) <bug_reports.json> <data_prefix>
"""

import json
from timeit import default_timer

import datetime
import sys
from collections import Counter
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from re import finditer
from scipy import sparse
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from tqdm import tqdm

def main():
print("Start", datetime.datetime.now().isoformat())
before = default_timer()

bug_report_file_path = sys.argv[1]
print("bug report file path", bug_report_file_path)
data_prefix = sys.argv[2]
print("data prefix", data_prefix)

bug_reports = load_bug_reports(bug_report_file_path)

process(data_prefix, bug_reports)

after = default_timer()
total = after - before
print("End", datetime.datetime.now().isoformat())
print("total time", total)


def load_bug_reports(bug_report_file_path):
"""load bug report file (the one generated from xml)"""
with open(bug_report_file_path) as bug_report_file:
bug_reports = json.load(bug_report_file)
return bug_reports


def process(data_prefix, bug_reports):
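    """Tokenize each bug report and, for every file it fixed, the combined summaries of
    earlier reports that fixed the same file; tf-idf vectorize everything and save the
    matrix together with a row-index lookup."""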
stemmer = PorterStemmer()

feature_3_data_list = []
feature_3_report_lookup = {}

current_index = 0
for bug_report_id in tqdm(bug_reports):
current_bug_report = bug_reports[bug_report_id]['bug_report']
open_timestamp = current_bug_report['open_timestamp']
fixed_files = current_bug_report['result']

lookup = {}

        report = str(current_bug_report['summary']) + ' ' + str(current_bug_report['description'])
report_tokens = tokenize(report, stemmer)
feature_3_data_list.append(report_tokens)
lookup['report'] = current_index
current_index += 1

        file_lookup = {}
for fixed_file in fixed_files:
combined_reports = combine_reports_fixing_same_file_before_date(bug_reports, fixed_file, open_timestamp)
combined_report_tokens = tokenize(combined_reports, stemmer)
feature_3_data_list.append(combined_report_tokens)
            file_lookup[fixed_file] = current_index
            current_index += 1
        lookup['files'] = file_lookup
feature_3_report_lookup[bug_report_id] = lookup

before_v = default_timer()
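    # DictVectorizer turns the list of per-document token Counters into a sparse term-count matrix.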
vectorizer = DictVectorizer()
vectorized_data = vectorizer.fit_transform(feature_3_data_list)
after_v = default_timer()
total_v = after_v - before_v
print("total count vectorization time ", total_v)
print("vectorized_data type ", type(vectorized_data))
print("vectorized_data shape", vectorized_data.shape)
sparse.save_npz(data_prefix+'_raw_count_data_before_tf_idf', vectorized_data)

before_tf_idf = default_timer()
transformer = TfidfTransformer()
tf_idf_data = transformer.fit_transform(vectorized_data)
after_tf_idf = default_timer()
total_tf_idf = after_tf_idf - before_tf_idf
print("total count tf idf time ", total_tf_idf)
print("tf_idf_data type ", type(tf_idf_data))
print("tf_idf_data shape", tf_idf_data.shape)

sparse.save_npz(data_prefix + '_feature_3_data', tf_idf_data)
with open(data_prefix + '_feature_3_report_lookup', 'w') as outfile:
json.dump(feature_3_report_lookup, outfile)


def combine_reports_fixing_same_file_before_date(bug_reports, selected_file, open_timestamp):
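    """Concatenate the summaries of all bug reports that fixed selected_file and whose timestamp precedes open_timestamp."""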
summaries = []
for bug_report_id in bug_reports:
current_bug_report = bug_reports[bug_report_id]['bug_report']
if selected_file in current_bug_report['result'] and current_bug_report['timestamp'] < open_timestamp:
summaries.append(str(current_bug_report['summary']))
            # summaries.append(str(current_bug_report['summary']) + str(current_bug_report['description']))

return ' '.join(summaries)


def camel_case_split(identifier):
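    """Split a camelCase identifier into its parts, e.g. 'getFileName' -> ['get', 'File', 'Name'], 'XMLParser' -> ['XML', 'Parser']."""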
matches = finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', identifier)
return [m.group(0) for m in matches]

removed = u'!"#%&\'()*+,-./:;<=>?@[\]^_`{|}~1234567890'
utf_translate_table = dict((ord(char), u' ') for char in removed)
stop_words = set(stopwords.words('english'))


def tokenize(text, stemmer):
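    """Strip punctuation and digits, tokenize, add camelCase sub-tokens alongside the original tokens,
    drop English stop words, stem, and return a Counter of term frequencies."""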
sanitized_text = text.translate(utf_translate_table)
tokens = wordpunct_tokenize(sanitized_text)
all_tokens = []
for token in tokens:
additional_tokens = camel_case_split(token)
if len(additional_tokens)>1:
for additional_token in additional_tokens:
all_tokens.append(additional_token)
all_tokens.append(token)
return Counter([stemmer.stem(token) for token in all_tokens if token.lower() not in stop_words])


if __name__ == '__main__':
main()
250 changes: 250 additions & 0 deletions calculate_buglocator_features.py
@@ -0,0 +1,250 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Usage: %(scriptName) <bug_report_file> <data_prefix>
"""

import json
from timeit import default_timer

import datetime
import numpy as np
import sys
from multiprocessing import Pool
from operator import itemgetter
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm


def main():
print("Start", datetime.datetime.now().isoformat())
before = default_timer()

bug_report_file_path = sys.argv[1]
print("bug report file path", bug_report_file_path)
data_prefix = sys.argv[2]
print("data prefix", data_prefix)

bug_reports = load_bug_reports(bug_report_file_path)

process(bug_reports, data_prefix, bug_report_file_path)

after = default_timer()
total = after - before
print("End", datetime.datetime.now().isoformat())
print("total time", total)


def load_bug_reports(bug_report_file_path):
"""load bug report file (the one generated from xml)"""
with open(bug_report_file_path) as bug_report_file:
bug_reports = json.load(bug_report_file)
return bug_reports


def sort_bug_reports_by_id(bug_reports):
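    """Return bug report ids ordered by report timestamp (oldest first)."""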
dates = []
for index, id in enumerate(tqdm(bug_reports)):
date = bug_reports[id]['bug_report']['timestamp']
dates.append((id, date))

sorted_dates = sorted(dates, key=itemgetter(1))
sorted_ids = [sorted_date[0] for sorted_date in sorted_dates]
return sorted_ids


def load_bug_report(vectorized_data, bug_report_indexes, bug_report_id):
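    """Fetch the tf-idf rows for a bug report's full text, summary and description."""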
# index_dict = pickle.loads(bug_report_indexes[bug_report_id[0:7]])
index_dict = bug_report_indexes[bug_report_id]
report_index = index_dict['report']
vectorized_report = vectorized_data[report_index, :]

summary_index = index_dict['summary']
vectorized_summary = vectorized_data[summary_index, :]

description_index = index_dict['description']
vectorized_description = vectorized_data[description_index, :]

return vectorized_report, vectorized_summary, vectorized_description


def feature_1(report, data, source_index, method_start_index, method_end_index):
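    """Feature 1: maximum cosine similarity between the bug report and the rows covering the file's source and its methods."""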
sources = data[source_index:method_end_index+1, :]
similarities = cosine_similarity(report, sources)

return np.amax(similarities)


def feature_2(report, data, enriched_api_indexes, current_file_index):
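    """Feature 2: maximum cosine similarity between the enriched bug report and the file's enriched-API rows."""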
file_enriched_api = enriched_api_indexes[current_file_index]
enriched_api_start = file_enriched_api['enrichedApiStart']
enriched_api_end = file_enriched_api['enrichedApiEnd']
return feature_sim(report, data, enriched_api_start, enriched_api_end)


def feature_sim(document, data, start_index, end_index):
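    """Maximum cosine similarity between document and the rows data[start_index:end_index + 1]; 0.0 for an empty range."""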
if start_index == end_index + 1:
return 0.0
sources = data[start_index:end_index+1, :]
similarities = cosine_similarity(document, sources)

return np.amax(similarities)


def feature_3(feature_3_data, feature_3_report_lookup, file_index):
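    """Feature 3: similarity between the bug report and the combined summaries of earlier reports that fixed this file, or 0.0 if there are none."""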
report_index = feature_3_report_lookup['report']
packages_with_directory = file_index.replace('/', '.')
for file in feature_3_report_lookup['files']:
if packages_with_directory.endswith(file):
feature_3_file_index = feature_3_report_lookup['files'][file]
report_row = feature_3_data[report_index, :]
file_row = feature_3_data[feature_3_file_index, :]
            return cosine_similarity(report_row, file_row)[0, 0]
return 0.0


def feature_4(bug_report_summary, file_index):
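    """Feature 4: length of the file's class name if it occurs verbatim in the bug report summary, else 0.0.
    E.g. (made-up names) summary 'NPE in DataBuffer.flush' with file 'foo/DataBuffer.java' gives 10.0."""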
class_name = file_index.split('/')[-1].split('.')[0]
if class_name in bug_report_summary:
return len(class_name)
else:
return 0.0


def process(bug_reports, data_prefix, bug_report_file_path):
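    """Compute per-file feature matrices for every bug report in chronological order using a pool of 12 worker processes."""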
sorted_ids = sort_bug_reports_by_id(bug_reports)

work = []
for bug_report_id in sorted_ids:
work.append((data_prefix, bug_report_id, bug_report_file_path))
# _f(work[0])
# exit(0)

pool = Pool(12, maxtasksperchild=1)
r = list(tqdm(pool.imap(_f, work), total=len(work)))
print("r", len(r))


def _f(args):
return process_bug_report(args[0], args[1], args[2])


def process_bug_report(data_prefix, bug_report_id, bug_report_file_path):
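    """Load the precomputed lookups, compute features f1-f19 and the used_in_fix label for every source file,
    and save them as <data_prefix>_<bug_report_id>_features.npz plus a matching list of file paths."""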
bug_reports = load_bug_reports(bug_report_file_path)
bug_report = bug_reports[bug_report_id]

vectorized_data = sparse.load_npz(data_prefix+'_raw_count_data.npz')
with open(data_prefix+'_feature_names_dict', 'r') as infile:
        feature_names_lengths_dict = json.load(infile)
with open(data_prefix+'_file_index_lookup', 'r') as infile:
file_index_lookup = json.load(infile)
with open(data_prefix+'_bug_report_index_lookup', 'r') as infile:
bug_report_index_lookup = json.load(infile)

enriched_api_data = sparse.load_npz(data_prefix+'_tfidf_enriched_api.npz').tocsr()
with open(data_prefix + '_partial_enriched_api_index_lookup') as infile:
enriched_api_lookup = json.load(infile)
with open(data_prefix + '_partial_enriched_api_bug_report_index_lookup') as infile:
enriched_api_bug_reports_lookup = json.load(infile)

graph_data = sparse.load_npz(data_prefix+'_graph_features_data.npz').tocsr()
with open(data_prefix + '_graph_features_index_lookup', 'r') as infile:
graph_lookup = json.load(infile)

feature_3_data = sparse.load_npz(data_prefix + '_feature_3_data.npz')
with open(data_prefix + '_feature_3_report_lookup', 'r') as infile:
feature_3_report_lookup = json.load(infile)

with open(data_prefix + '_feature_5_report_lookup', 'r') as infile:
recency_lookup = json.load(infile)
with open(data_prefix + '_feature_6_report_lookup', 'r') as infile:
frequency_lookup = json.load(infile)

(vectorized_report, vectorized_summary, vectorized_description) = load_bug_report(vectorized_data, bug_report_index_lookup, bug_report_id)

enriched_report = enriched_api_data[enriched_api_bug_reports_lookup[bug_report_id], :]

current_feature_3_report_lookup = feature_3_report_lookup[bug_report_id]

current_bug_report_summary = bug_reports[bug_report_id]['bug_report']['summary']

features = []
features_files = []

for file_index in file_index_lookup:
current_lookup = file_index_lookup[file_index]
source_index = current_lookup['source']

method_source_start_index = current_lookup['methodsStart']
method_source_end_index = current_lookup['methodsEnd']

class_start_index = current_lookup['classNamesStart']
class_end_index = current_lookup['classNamesEnd']

method_names_start_index = current_lookup['methodNamesStart']
method_names_end_index = current_lookup['methodNamesEnd']

variable_start_index = current_lookup['variableNamesStart']
variable_end_index = current_lookup['variableNamesEnd']

comment_start_index = current_lookup['commentsStart']
comment_end_index = current_lookup['commentsEnd']

current_graph_lookup = graph_lookup[file_index]

current_recency_lookup = recency_lookup[bug_report_id]
current_frequency_lookup = frequency_lookup[bug_report_id]

class_with_packages_and_directory = file_index.replace('/', '.')

f1 = feature_1(vectorized_report, vectorized_data, source_index, method_source_start_index, method_source_end_index)
f2 = feature_2(enriched_report, enriched_api_data, enriched_api_lookup, file_index)
f3 = feature_3(feature_3_data, current_feature_3_report_lookup, file_index)
f4 = feature_4(current_bug_report_summary, file_index)

f5 = 0.0
for recency_file in current_recency_lookup.keys():
if class_with_packages_and_directory.endswith(recency_file):
f5 = current_recency_lookup[recency_file]
break
f6 = 0.0
for frequency_file in current_frequency_lookup.keys():
if class_with_packages_and_directory.endswith(frequency_file):
                f6 = current_frequency_lookup[frequency_file]
break

f7 = feature_sim(vectorized_summary, vectorized_data, class_start_index, class_end_index)
f8 = feature_sim(vectorized_summary, vectorized_data, method_names_start_index, method_names_end_index)
f9 = feature_sim(vectorized_summary, vectorized_data, variable_start_index, variable_end_index)
f10 = feature_sim(vectorized_summary, vectorized_data, comment_start_index, comment_end_index)

f11 = feature_sim(vectorized_description, vectorized_data, class_start_index, class_end_index)
f12 = feature_sim(vectorized_description, vectorized_data, method_names_start_index, method_names_end_index)
f13 = feature_sim(vectorized_description, vectorized_data, variable_start_index, variable_end_index)
f14 = feature_sim(vectorized_description, vectorized_data, comment_start_index, comment_end_index)

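        # f15-f19: precomputed graph-based feature columns for this file.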
f15 = graph_data[current_graph_lookup, 0]
f16 = graph_data[current_graph_lookup, 1]
f17 = graph_data[current_graph_lookup, 2]
f18 = graph_data[current_graph_lookup, 3]
f19 = graph_data[current_graph_lookup, 4]

used_in_fix = 0.0
for result in bug_report['bug_report']['result']:
if class_with_packages_and_directory.endswith(result):
used_in_fix = 1.0
break

features.append([f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15, f16, f17, f18, f19, used_in_fix])
features_files.append(file_index)

sparse_features = sparse.csr_matrix(features)
sparse.save_npz(data_prefix+'_'+bug_report_id+'_features', sparse_features)
with open(data_prefix+'_'+bug_report_id+'_files', 'w') as outfile:
json.dump(features_files, outfile)


if __name__ == '__main__':
main()