diff --git a/README.md b/README.md
index b7979692..4d927287 100644
--- a/README.md
+++ b/README.md
@@ -57,65 +57,65 @@ Follow these steps to set up the environment and run the application.

 2. Clone the forked repository.

-   ```bash
-   git clone https://github.com//Resume-Matcher.git
-   cd Resume-Matcher
-   ```
+   ```bash
+   git clone https://github.com//Resume-Matcher.git
+   cd Resume-Matcher
+   ```

 3. Create a Python Virtual Environment:

-   - Using [virtualenv](https://learnpython.com/blog/how-to-use-virtualenv-python/):
+   - Using [virtualenv](https://learnpython.com/blog/how-to-use-virtualenv-python/):

-     _Note_: Check how to install virtualenv on your system here [link](https://learnpython.com/blog/how-to-use-virtualenv-python/).
+     _Note_: Check how to install virtualenv on your system here [link](https://learnpython.com/blog/how-to-use-virtualenv-python/).

-     ```bash
-     virtualenv env
-     ```
+     ```bash
+     virtualenv env
+     ```

-     **OR**
+     **OR**

-   - Create a Python Virtual Environment:
+   - Create a Python Virtual Environment:

-     ```bash
-     python -m venv env
-     ```
+     ```bash
+     python -m venv env
+     ```

 4. Activate the Virtual Environment.

-   - On Windows.
+   - On Windows.

-     ```bash
-     env\Scripts\activate
-     ```
+     ```bash
+     env\Scripts\activate
+     ```

-   - On macOS and Linux.
+   - On macOS and Linux.

-     ```bash
-     source env/bin/activate
-     ```
+     ```bash
+     source env/bin/activate
+     ```

 5. Install Dependencies:

-   ```bash
-   pip install -r requirements.txt
-   ```
+   ```bash
+   pip install -r requirements.txt
+   ```

 6. Prepare Data:

-   - Resumes: Place your resumes in PDF format in the `Data/Resumes` folder. Remove any existing contents in this folder.
-   - Job Descriptions: Place your job descriptions in PDF format in the `Data/JobDescription` folder. Remove any existing contents in this folder.
+   - Resumes: Place your resumes in PDF format in the `Data/Resumes` folder. Remove any existing contents in this folder.
+   - Job Descriptions: Place your job descriptions in PDF format in the `Data/JobDescription` folder. Remove any existing contents in this folder.

 7. Parse Resumes to JSON:

-   ```python
-   python run_first.py
-   ```
+   ```bash
+   python run_first.py
+   ```

 8. Run the Application:

-   ```python
-   streamlit run streamlit_app.py
-   ```
+   ```bash
+   streamlit run streamlit_app.py
+   ```

 **Note**: For local versions, you do not need to run "streamlit_second.py" as it is specifically for deploying to Streamlit servers.

@@ -127,12 +127,29 @@ Follow these steps to set up the environment and run the application.

 1. Build the image and start application

-   ```bash
-   docker-compose up
-   ```
+   ```bash
+   docker-compose up
+   ```

 2. Open `localhost:80` on your browser

+### Cohere and Qdrant
+
+1. Visit the [Cohere registration page](https://dashboard.cohere.ai/welcome/register) and create an account.
+2. Go to API keys and copy your Cohere API key.
+3. Visit the [Qdrant website](https://cloud.qdrant.io/) and create an account.
+4. Get your API key and cluster URL as well.
+5. Create a YAML file named `config.yml` in the `scripts/similarity/` folder.
+6. The format of the config file should be as below:
+   ```yaml
+   cohere:
+     api_key: cohere_key
+   qdrant:
+     api_key: qdrant_api_key
+     url: qdrant_cluster_url
+   ```
+7. Replace the placeholder values with your own keys and URL, without quotes.
+
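For reference, the `config.yml` described above is read with a plain `yaml.safe_load` call, and the credentials are looked up under the `cohere` and `qdrant` keys. A minimal sketch of that lookup, assuming the YAML layout shown in step 6 (`load_credentials` is an illustrative helper name, not part of the repository):

```python
import yaml

def load_credentials(path="scripts/similarity/config.yml"):
    """Load the Cohere/Qdrant credentials from the YAML file created above."""
    with open(path) as f:
        config = yaml.safe_load(f)
    # The key layout mirrors the sample config shown in step 6.
    return (
        config["cohere"]["api_key"],
        config["qdrant"]["api_key"],
        config["qdrant"]["url"],
    )

cohere_key, qdrant_key, qdrant_url = load_credentials()
```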
diff --git a/archive/resume_matcher.ipynb b/archive/resume_matcher.ipynb
index 39fac1a4..0c5d50bc 100644
--- a/archive/resume_matcher.ipynb
+++ b/archive/resume_matcher.ipynb
@@ -17,32 +17,11 @@
 "cells": [
 {
 "cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
 "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "aHoRFk4LpFSZ",
- "outputId": "0a950106-ea2a-498a-9dcc-e99458b1f139"
+ "id": "aHoRFk4LpFSZ"
 },
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.5/44.5 kB\u001b[0m \u001b[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.7/2.7 MB\u001b[0m \u001b[31m30.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m132.5/132.5 kB\u001b[0m \u001b[31m1.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.6/2.6 MB\u001b[0m \u001b[31m11.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.4/75.4 kB\u001b[0m \u001b[31m6.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m304.5/304.5 kB\u001b[0m \u001b[31m12.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m74.5/74.5 kB\u001b[0m \u001b[31m6.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.5/57.5 kB\u001b[0m \u001b[31m5.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m5.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25h"
- ]
- }
- ],
+ "outputs": [],
 "source": [
 "!pip install cohere --quiet\n",
 "!pip install qdrant-client --quiet"
@@ -55,14 +34,27 @@
 "from qdrant_client import QdrantClient, models\n",
 "from qdrant_client.http.models import Batch\n",
 "import cohere\n",
+ "\n",
 "def read_config(filepath):\n",
- "    with open(filepath) as f:\n",
- "        config = yaml.safe_load(f)\n",
- "    return config\n",
+ "    try:\n",
+ "        with open(filepath) as f:\n",
+ "            config = yaml.safe_load(f)\n",
+ "            return config\n",
+ "    except FileNotFoundError as e:\n",
+ "        print(f\"Configuration file {filepath} not found: {e}\")\n",
+ "    except yaml.YAMLError as e:\n",
+ "        print(f\"Error parsing YAML in configuration file {filepath}: {e}\")\n",
+ "    except Exception as e:\n",
+ "        print(f\"Error reading configuration file {filepath}: {e}\")\n",
+ "    return None\n",
+ "\n",
 "\n",
 "class QdrantSearch:\n",
 "    def __init__(self, resumes, jd):\n",
 "        config = read_config(\"config.yml\")\n",
+ "        # read_config() returns None on failure, so fail fast here rather\n",
+ "        # than raising a confusing TypeError on the lookups below.\n",
+ "        if config is None:\n",
+ "            raise ValueError(\"config.yml could not be read\")\n",
 "        self.cohere_key = config['cohere']['api_key']\n",
 "        self.qdrant_key = config['qdrant']['api_key']\n",
 "        self.qdrant_url = config['qdrant']['url']\n",
@@ -128,7 +120,7 @@
 "metadata": {
 "id": "SXOgwcCATtww"
 },
- "execution_count": null,
+ "execution_count": 6,
 "outputs": []
 },
 {
MongoDB Expressjs React and Nodejs Currently contributing to the development of AI technologies at OpenAI with a primary focus on the ChatGPT project Skills JavaScript and TypeScript MongoDB Expressjs React Nodejs MERN stack RESTful APIs Git and GitHub Docker and Kubernetes Agile and Scrum Python and Machine Learning basics Experience June 2020 PresentMERN Stack Developer OpenAI San Francisco USA Working on the development of the ChatGPT project using Nodejs Expressjs and React Implementing RESTful services for communication between frontend and backend Utilizing Docker and Kubernetes for deployment and management of applications Working in an Agile environment delivering highquality software every sprint Contributing to the design and implementation of machine learning algorithms for natural language processing tasks July 2015 May 2020Full Stack Developer Uber San Francisco USA Developed and maintained scalable web applications using MERN stack Ensured the performance quality and responsiveness of applications Successfully deployed solutions using Docker and Kubernetes Collaborated with a team of engineers product managers and UX designers Led a team of junior developers conducted code reviews and ensured adherence to best coding practices Worked closely with the data science team to optimize recommendation algorithms and enhance user experience June 2012 June 2015Software Developer Facebook Menlo Park USA Developed features for the Facebook web application using React Ensured the performance of the MongoDB databases Utilized RESTful APIs for communication between different parts of the application Worked in a fastpaced testdriven development environment Assisted in migrating the legacy system to a modern MERN stack architecture Education 2009 2012 PhD in Computer Science CalTech Pasadena USA 2007 2009 Master of Science in Computer Science MIT Cambridge USA 2003 2007 Bachelor of Science in Computer Science UC San Diego San Diego USA 1/2 Projects 2019 PresentPersonal Project Gotham Event Planner Created a fullfeatured web application to plan and organize events in Gotham city Used MERN stack for development and Docker for deployment The application allows users to create manage and share events and integrates with Google Maps API to display event locations 2/2\"]\n", "job_description = \"Job Description Java Developer 3 Years of Experience Tech Solutions San Francisco CA USA About Us At Tech Solutions we believe in the power of technology to solve complex problems We are a dynamic forwardthinking tech company specializing in custom software solutions for various industries We are seeking a talented and experienced Java Developer to join our team Job Description We are seeking a skilled Java Developer with at least 3 years of experience in building highperforming scal able enterprisegrade applications You will be part of a talented software team that works on missioncritical applications Your roles and responsibilities will include managing Java/Java EE application development while providing expertise in the full software development lifecycle Responsibilities •Designing implementing and maintaining Java applications that are often highvolume and low latency required for missioncritical systems •Delivering high availability and performance •Contributing to all phases of the development lifecycle •Writing welldesigned efficient and testable code •Conducting software analysis programming testing and debugging •Ensuring designs comply with specifications •Preparing and producing releases of 
software components •Supporting continuous improvement by investigating alternatives and technologies and presenting these for architectural review Requirements •BS/MS degree in Computer Science Engineering or a related subject •Proven handson Software Development experience •Proven working experience in Java development •Handson experience in designing and developing applications using Java EE platforms •ObjectOriented Analysis and design using common design patterns •Profound insight of Java and JEE internals Classloading Memory Management Transaction man agement etc 1 •Excellent knowledge of Relational Databases SQL and ORM technologies JPA2 Hibernate •Experience in developing web applications using at least one popular web framework JSF Wicket GWT Spring MVC •Experience with testdriven development Benefits •Competitive salary package •Health dental and vision insurance •Retirement savings plan •Professional development opportunities •Flexible work hours Tech Solutions is proud to be an equal opportunity employer We celebrate diversity and are committed to creating an inclusive environment for all employees How to Apply To apply please submit your resume and a brief explanation of your relevant experience to 2\"\n",
+ "config = read_config(\"config.yml\")\n",
+ "if not config:\n",
+ "    print(\"Cannot proceed: config.yml was not found or could not be parsed.\")\n",
+ "else:\n",
+ "    qdrant_search = QdrantSearch(resumes, job_description)\n",
 "\n",
- "qdrant_search = QdrantSearch(resumes, job_description)\n",
+ "    qdrant_search.update_qdrant()\n",
 "\n",
- "qdrant_search.update_qdrant()\n",
- "\n",
- "results = qdrant_search.search()\n",
- "for r in results:\n",
- "    print(r)"
+ "    results = qdrant_search.search()\n",
+ "    for r in results:\n",
+ "        print(r)"
 ],
 "metadata": {
 "colab": {
 "base_uri": "https://localhost:8080/"
 },
 "id": "rlP3s5euo435",
- "outputId": "3f4f15b6-d446-4491-d4d5-d9ba14a2a145"
+ "outputId": "389c00e7-8cd1-4dd6-f517-d923e3c4bf2a"
 },
- "execution_count": null,
+ "execution_count": 10,
 "outputs": [
 {
 "output_type": "stream",
@@ -162,6 +157,15 @@
 ]
 }
 ]
+ },
+ {
+ "cell_type": "code",
+ "source": [],
+ "metadata": {
+ "id": "WFdXngZkEyOm"
+ },
+ "execution_count": null,
+ "outputs": []
 }
 ]
}
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 6be616f8..f6359cb8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -108,4 +108,5 @@ wasabi==1.1.2
 watchdog==3.0.0
 zipp==3.16.2
-cohere~=4.19.2
\ No newline at end of file
+cohere~=4.19.2
+qdrant-client
\ No newline at end of file
diff --git a/scripts/similarity/get_similarity_score.py b/scripts/similarity/get_similarity_score.py
new file mode 100644
index 00000000..feec9dfd
--- /dev/null
+++ b/scripts/similarity/get_similarity_score.py
@@ -0,0 +1,174 @@
+import json
+import logging
+import os
+
+import cohere
+import yaml
+from qdrant_client import QdrantClient, models
+from qdrant_client.http.models import Batch
+
+logging.basicConfig(
+    filename='app_similarity_score.log',
+    filemode='w',
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+)
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+
+console_handler = logging.StreamHandler()
+formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+console_handler.setFormatter(formatter)
+console_handler.setLevel(logging.DEBUG)
+
+file_handler = logging.FileHandler("app_similarity_score.log")
+file_handler.setLevel(logging.DEBUG)
+file_handler.setFormatter(formatter)
+
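+# basicConfig() above already attaches a handler writing to
+# app_similarity_score.log on the root logger, so the handlers added below
+# would duplicate every record if this logger also propagated to the root.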
+logger.addHandler(file_handler)
+logger.addHandler(console_handler)
+logger.propagate = False
+
+
+def find_path(folder_name):
+    curr_dir = os.getcwd()
+    while True:
+        if folder_name in os.listdir(curr_dir):
+            return os.path.join(curr_dir, folder_name)
+        else:
+            parent_dir = os.path.dirname(curr_dir)
+            if parent_dir == curr_dir:  # reached the filesystem root
+                break
+            curr_dir = parent_dir
+    raise ValueError(f"Folder '{folder_name}' not found.")
+
+
+cwd = find_path('Resume-Matcher')
+READ_RESUME_FROM = os.path.join(cwd, 'Data', 'Processed', 'Resumes')
+READ_JOB_DESCRIPTION_FROM = os.path.join(cwd, 'Data', 'Processed', 'JobDescription')
+config_path = os.path.join(cwd, "scripts", "similarity")
+
+
+def read_config(filepath):
+    try:
+        with open(filepath) as f:
+            config = yaml.safe_load(f)
+            return config
+    except FileNotFoundError as e:
+        logger.error(f"Configuration file {filepath} not found: {e}")
+    except yaml.YAMLError as e:
+        logger.error(f"Error parsing YAML in configuration file {filepath}: {e}", exc_info=True)
+    except Exception as e:
+        logger.error(f"Error reading configuration file {filepath}: {e}")
+    return None
+
+
+def read_doc(path):
+    with open(path) as f:
+        try:
+            data = json.load(f)
+        except Exception as e:
+            logger.error(f'Error reading JSON file: {e}')
+            data = {}
+    return data
+
+
+class QdrantSearch:
+    def __init__(self, resumes, jd):
+        config = read_config(os.path.join(config_path, "config.yml"))
+        # read_config() logs the reason and returns None on failure.
+        if config is None:
+            raise ValueError("Could not read config.yml")
+        self.cohere_key = config['cohere']['api_key']
+        self.qdrant_key = config['qdrant']['api_key']
+        self.qdrant_url = config['qdrant']['url']
+        self.resumes = resumes
+        self.jd = jd
+        self.cohere = cohere.Client(self.cohere_key)
+
+        self.qdrant = QdrantClient(
+            url=self.qdrant_url,
+            api_key=self.qdrant_key,
+        )
+
+        vector_size = 4096  # dimensionality of Cohere's "large" embedding model
+        self.qdrant.recreate_collection(
+            collection_name="collection_resume_matcher",
+            vectors_config=models.VectorParams(
+                size=vector_size,
+                distance=models.Distance.COSINE
+            )
+        )
+
+        self.logger = logging.getLogger(self.__class__.__name__)
+        self.logger.addHandler(console_handler)
+        self.logger.addHandler(file_handler)
+        self.logger.propagate = False
+
+    def get_embedding(self, text):
+        try:
+            embeddings = self.cohere.embed([text], "large").embeddings
+            return list(map(float, embeddings[0])), len(embeddings[0])
+        except Exception as e:
+            self.logger.error(f"Error getting embeddings: {e}", exc_info=True)
+            raise
+
+    def update_qdrant(self):
+        vectors = []
+        ids = []
+        for i, resume in enumerate(self.resumes):
+            vector, size = self.get_embedding(resume)
+            vectors.append(vector)
+            ids.append(i)
+        try:
+            self.qdrant.upsert(
+                collection_name="collection_resume_matcher",
+                points=Batch(
+                    ids=ids,
+                    vectors=vectors,
+                    payloads=[{"text": resume} for resume in self.resumes]
+                )
+            )
+        except Exception as e:
+            self.logger.error(f"Error upserting the vectors to the qdrant collection: {e}", exc_info=True)
+
+    def search(self):
+        vector, _ = self.get_embedding(self.jd)
+
+        hits = self.qdrant.search(
+            collection_name="collection_resume_matcher",
+            query_vector=vector,
+            limit=30
+        )
+        results = []
+        for hit in hits:
+            result = {
+                'text': str(hit.payload)[:30],
+                'score': hit.score
+            }
+            results.append(result)
+
+        return results
+
+
+def get_similarity_score(resume_string, job_description_string):
+    logger.info("Started getting similarity score")
+    qdrant_search = QdrantSearch([resume_string], job_description_string)
+    qdrant_search.update_qdrant()
+    search_result = qdrant_search.search()
+    logger.info("Finished getting similarity score")
+    return search_result
+
+
+if __name__ == "__main__":
+    # To test with your own documents, point these paths at the processed
+    # JSON files produced by run_first.py.
+    resume_dict = 
read_doc(
+        READ_RESUME_FROM + "/Resume-bruce_wayne_fullstack.pdf4783d115-e6fc-462e-ae4d-479152884b28.json")
+    job_dict = read_doc(
+        READ_JOB_DESCRIPTION_FROM + "/JobDescription-job_desc_full_stack_engineer_pdf4de00846-a4fe-4fe5-a4d7"
+        "-2a8a1b9ad020.json")
+    resume_keywords = resume_dict["extracted_keywords"]
+    job_description_keywords = job_dict["extracted_keywords"]
+
+    resume_string = ' '.join(resume_keywords)
+    jd_string = ' '.join(job_description_keywords)
+    final_result = get_similarity_score(resume_string, jd_string)
+    for r in final_result:
+        print(r)
diff --git a/streamlit_app.py b/streamlit_app.py
index a2128482..91812aa0 100644
--- a/streamlit_app.py
+++ b/streamlit_app.py
@@ -1,15 +1,22 @@
-import networkx as nx
+import json
+import os
 from typing import List
-import streamlit as st
+
+import networkx as nx
+import nltk
 import pandas as pd
-import json
 import plotly.express as px
 import plotly.graph_objects as go
-from scripts.utils.ReadFiles import get_filenames_from_dir
-from streamlit_extras import add_vertical_space as avs
+import streamlit as st
 from annotated_text import annotated_text, parameters
+from streamlit_extras import add_vertical_space as avs
 from streamlit_extras.badges import badge
-import nltk
+
+from scripts.similarity.get_similarity_score import get_similarity_score, find_path, read_config
+from scripts.utils.ReadFiles import get_filenames_from_dir
+
+cwd = find_path('Resume-Matcher')
+config_path = os.path.join(cwd, "scripts", "similarity")

 try:
     nltk.data.find('tokenizers/punkt')
@@ -32,7 +39,7 @@ def create_star_graph(nodes_and_weights, title):
     # Add nodes and edges with weights to the graph
     for node, weight in nodes_and_weights:
         G.add_node(node)
-        G.add_edge(central_node, node, weight=weight*100)
+        G.add_edge(central_node, node, weight=weight * 100)

     # Get position layout for nodes
     pos = nx.spring_layout(G)
@@ -144,15 +151,15 @@ def tokenize_string(input_string):

 if len(resume_names) > 1:
     st.write("There are", len(resume_names),
-             " resumes present. Please select one from the menu below:")
-    output = st.slider('Select Resume Number', 0, len(resume_names)-1, 0)
+             " resumes present. Please select one from the menu below:")
+    output = st.slider('Select Resume Number', 0, len(resume_names) - 1, 0)
 else:
     st.write("There is 1 resume present")

 avs.add_vertical_space(5)

 st.write("You have selected ", resume_names[output], " printing the resume")

-selected_file = read_json("Data/Processed/Resumes/"+resume_names[output])
+selected_file = read_json("Data/Processed/Resumes/" + resume_names[output])

 avs.add_vertical_space(2)
 st.markdown("#### Parsed Resume Data")
@@ -181,7 +188,7 @@ def tokenize_string(input_string):
 # Create the dictionary
 keyword_dict = {}
 for keyword, value in selected_file['keyterms']:
-    keyword_dict[keyword] = value*100
+    keyword_dict[keyword] = value * 100

 fig = go.Figure(data=[go.Table(header=dict(values=["Keyword", "Value"],
                                            font=dict(size=12),
@@ -207,9 +214,9 @@ def tokenize_string(input_string):
 output = 0
 if len(job_descriptions) > 1:
     st.write("There are", len(job_descriptions),
-             " resumes present. Please select one from the menu below:")
+             " job descriptions present. 
Please select one from the menu below:")
     output = st.slider('Select Job Description Number',
-                       0, len(job_descriptions)-1, 0)
+                       0, len(job_descriptions) - 1, 0)
 else:
     st.write("There is 1 job description present")

@@ -218,7 +225,7 @@ def tokenize_string(input_string):

 st.write("You have selected ", job_descriptions[output], " printing the job description")
 selected_jd = read_json(
-    "Data/Processed/JobDescription/"+job_descriptions[output])
+    "Data/Processed/JobDescription/" + job_descriptions[output])

 avs.add_vertical_space(2)
 st.markdown("#### Job Description")
@@ -244,7 +251,7 @@ def tokenize_string(input_string):
 # Create the dictionary
 keyword_dict = {}
 for keyword, value in selected_jd['keyterms']:
-    keyword_dict[keyword] = value*100
+    keyword_dict[keyword] = value * 100

 fig = go.Figure(data=[go.Table(header=dict(values=["Keyword", "Value"],
                                            font=dict(size=12),
@@ -265,6 +272,20 @@ def tokenize_string(input_string):

 avs.add_vertical_space(3)

+config_file_path = os.path.join(config_path, "config.yml")
+if os.path.exists(config_file_path):
+    config_data = read_config(config_file_path)
+    if config_data:
+        print("Config file parsed successfully.")
+        resume_string = ' '.join(selected_file["extracted_keywords"])
+        jd_string = ' '.join(selected_jd["extracted_keywords"])
+        result = get_similarity_score(resume_string, jd_string)
+        similarity_score = result[0]["score"]
+        st.write("Similarity Score obtained for the resume and job description is:", similarity_score)
+else:
+    print("Config file does not exist.")
+
+
 st.title(':blue[Resume Matcher]')
 st.subheader(
     'Free and Open Source ATS to help your resume pass the screening stage.')
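End to end, the similarity flow added in this patch embeds both documents with Cohere, upserts the resume vector into a Qdrant collection, and runs a cosine-similarity search with the job description as the query. A minimal usage sketch of the new module (the keyword strings are illustrative; a valid `config.yml` and network access to Cohere and Qdrant are assumed):

```python
from scripts.similarity.get_similarity_score import get_similarity_score

# Illustrative keyword strings; in the app these come from the
# "extracted_keywords" field of the processed JSON files.
resume_keywords = "python react nodejs docker kubernetes rest apis"
jd_keywords = "java spring hibernate sql docker rest apis"

# Embeds both strings, upserts the resume into Qdrant, and returns hits
# shaped like {'text': ..., 'score': ...}, ordered by cosine similarity.
for hit in get_similarity_score(resume_keywords, jd_keywords):
    print(hit['text'], hit['score'])
```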