Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add spam classifier with google gemini flash 2.0 experimental and check for spam earlier #58

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 25 additions & 18 deletions docassemble/GithubFeedbackForm/data/questions/feedback.yml
Original file line number Diff line number Diff line change
Expand Up @@ -311,26 +311,33 @@ need:
- package_version
- filename
code: |
if not task_performed('issue noted', persistent=True):
saved_uuid
if showifdef('would_be_on_panel', False):
add_panel_participant(panel_email)
if should_send_to_github:
issue_url
if issue_url:
if saved_uuid:
set_feedback_github_url(saved_uuid, issue_url)
else:
al_error_email
log(f"This form was not able to add an issue on the {github_user}/{github_repo} repo. Check your config.")
if al_error_email and not is_likely_spam(issue_template.content):
log(f"Unable to create issue on repo {github_repo}, falling back to emailing {al_error_email}")
send_email(to=al_error_email, subject=f"{github_repo} - {issue_template.subject_as_html(trim=True)}", template=issue_template)
else:
log(f"~~~USER FEEDBACK~~~ {github_repo} -{issue_template.subject_as_html(trim=True)} - {issue_template.content_as_html(trim=True)}")
if is_likely_spam(issue_template.content):
log("Not saving feedback because it looks like spam")
mark_task_as_performed('issue noted', persistent=True)
issue_url = None
saved_uuid = None
note_issue = False
nonprofittechy marked this conversation as resolved.
Show resolved Hide resolved
else:
log("Already sent feedback to github from a feedback interview, not going to send again")
if not task_performed('issue noted', persistent=True):
saved_uuid
if showifdef('would_be_on_panel', False):
add_panel_participant(panel_email)
if should_send_to_github:
issue_url
if issue_url:
if saved_uuid:
set_feedback_github_url(saved_uuid, issue_url)
else:
al_error_email
log(f"This form was not able to add an issue on the {github_user}/{github_repo} repo. Check your config.")
if al_error_email and not is_likely_spam(issue_template.content):
log(f"Unable to create issue on repo {github_repo}, falling back to emailing {al_error_email}")
send_email(to=al_error_email, subject=f"{github_repo} - {issue_template.subject_as_html(trim=True)}", template=issue_template)
else:
log(f"~~~USER FEEDBACK~~~ {github_repo} -{issue_template.subject_as_html(trim=True)} - {issue_template.content_as_html(trim=True)}")
mark_task_as_performed('issue noted', persistent=True)
else:
log("Already sent feedback to github from a feedback interview, not going to send again")

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can likely simplify the nested ifs much more. Current structure, as I understand it:

if feedback looks like spam:
    log
    mark task as done
    set some values
else:
    if task not yet performed:
        prepare saved_uuid
        if user should be added:
            add user
        if feedback should be sent to github:
            prepare
            if url:
                if saved_uuid:
                    link
            else:
                log
                if error email AND not spam:
                    log
                    send
                else:
                    log
    else:
        log
set note_issue to true

What we could do, to reduce code duplication and the logic branches:

if feedback looks like spam:
    log
    mark as done 
    save values
    return
if task already performed:
    log
    return
    
if should add user to panel:
    add user

if send feedback to github:
    create issue
    if issue_url and saved_uuid:
        link to issue
    else:
        log error
        if error email configured:
            send
        else:
            log

mark as done
set note_issue to true
    

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can't use a return statement here in a Docassemble code block, unfortunately! I can take another look at simplifying this--I just wanted to be careful to scope my change to be as small as possible.

note_issue = True
---
code: |
Expand Down
77 changes: 74 additions & 3 deletions docassemble/GithubFeedbackForm/github_issue.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@
from docassemble.base.util import log, get_config, interview_url
import re

try:
import google.generativeai as genai
except:
nonprofittechy marked this conversation as resolved.
Show resolved Hide resolved
pass

# reference: https://gist.github.com/JeffPaine/3145490
# https://docs.github.com/en/free-pro-team@latest/rest/reference/issues#create-an-issue

Expand All @@ -16,6 +21,7 @@
"make_github_issue",
"feedback_link",
"is_likely_spam",
"is_likely_spam_from_genai",
"prefill_github_issue_url",
]
USERNAME = get_config("github issues", {}).get("username")
Expand Down Expand Up @@ -168,8 +174,72 @@ def feedback_link(
)


def is_likely_spam_from_genai(
body: Optional[str],
context: Optional[str] = None,
gemini_api_key: Optional[str] = None,
model="gemini-2.0-flash-exp",
) -> bool:
"""
Check if the body of the issue is likely spam with the help of Google Gemini Flash experimental.

Args:
body (Optional[str]): the body of the issue
context (Optional[str]): the context of the issue to help rate it as spam or not, defaults to a guided interview in the legal context
gemini_api_key (Optional[str]): the token for the Google Gemini Flash API, can be specified in the global config as `google gemini api key`
model (Optional[str]): the model to use for the spam detection, defaults to "gemini-2.0-flash-exp", can be specified in the global config
as `github issues: spam model`
"""
if not body:
return False

if not context:
context = "a guided interview in the legal context"

Comment on lines +196 to +198

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To improve readability when setting defaults, we can use the `or` operator. For example:

context = context or "a guided interview in the legal context"
gemini_api_key = gemini_api_key or get_config("google gemini api key")
... etc ...

if not gemini_api_key:
gemini_api_key = get_config("google gemini api key")

if not gemini_api_key:
log("Not using Google Gemini Flash to check for spam: no token provided")
return False

if not model:
model = get_config("github issues", {}).get(
"spam model", "gemini-2.0-flash-exp"
)

try:
genai.configure(api_key=gemini_api_key)
model = genai.GenerativeModel(
model_name=model,
system_instruction=f"""
You are reviewing a feedback form for {context}. Your job is to allow as many
relevant feedback responses as possible while filtering out irrelevant and spam feedback,
especially targeted advertising that isn't pointing out a problem on the guided interview.

Rate the user's feedback as 'spam' or 'not spam' based on the context of the guided interview.
Answer only with the exact keywords: 'spam' or 'not spam'.
""",
)
except Exception as e:
log(f"Error configuring Google Gemini Flash: {e}")
return False

try:
response = model.generate_content(body)
if response.text.strip() == "spam":
return True
Comment on lines +228 to +231

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I get the sense that this would be more readable if we folded it into the other try block. I'm not sure there's a need to keep them distinct. We can use specific exception types to do this. The structure would change to something like this:

try:
    attempt configuration
    generate the response
except UseANameException as e:
    log error configuring 
    return False
except Exception as e:
    log generic error 

except Exception as e:
log(f"Error using Google Gemini Flash: {e}")
return False
return False


def is_likely_spam(
body: Optional[str], keywords: Optional[List[str]] = None, filter_urls: bool = True
body: Optional[str],
keywords: Optional[List[str]] = None,
filter_urls: bool = True,
model: Optional[str] = None,
) -> bool:
"""
Check if the body of the issue is likely spam based on a set of keywords and URLs.
Expand All @@ -179,9 +249,10 @@ def is_likely_spam(

Args:
body (Optional[str]): the body of the issue
keywords (Optional[List[str]]): a list of keywords that are likely spam, defaults to a set of keywords
keywords (Optional[List[str]]): a list of additional keywords that are likely spam, defaults to a set of keywords
from the global configuration under the `github issues: spam keywords` key
"""

_urls = ["leadgeneration.com", "leadmagnet.com"]
_keywords = [
"100 times more effective",
Expand Down Expand Up @@ -244,7 +315,7 @@ def is_likely_spam(
if re.search(url_regex, body):
return True

return False
return is_likely_spam_from_genai(body, model=model)


def prefill_github_issue_url(
Expand Down
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,7 @@ exclude = '''(?x)(
[[tool.mypy.overrides]]
module = "docassemble.base.*"
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = "google.*"
ignore_missing_imports = true
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def find_package_data(where='.', package='', exclude=standard_exclude, exclude_d
url='https://courtformsonline.org',
packages=find_packages(),
namespace_packages=['docassemble'],
install_requires=['docassemble.ALToolbox>=0.6.0'],
install_requires=['docassemble.ALToolbox>=0.6.0', 'google-generativeai'],
zip_safe=False,
package_data=find_package_data(where='docassemble/GithubFeedbackForm/', package='docassemble.GithubFeedbackForm'),
)
Expand Down
Loading