Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Duckduckgosearch #1388

Merged
merged 24 commits into from
Jul 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
14fc219
Updated conversation_api.md document/upload
guoyuhao2330 May 15, 2024
7be4682
Merge branch 'infiniflow:main' into main
guoyuhao2330 May 16, 2024
ec50e1f
Merge branch 'infiniflow:main' into main
guoyuhao2330 May 17, 2024
cbc8260
Merge branch 'infiniflow:main' into main
guoyuhao2330 May 17, 2024
6c84001
Merge branch 'infiniflow:main' into main
guoyuhao2330 May 20, 2024
a7f7ee4
Merge branch 'infiniflow:main' into main
guoyuhao2330 May 22, 2024
75e1187
Merge branch 'infiniflow:main' into main
guoyuhao2330 May 22, 2024
bb042ae
Merge branch 'infiniflow:main' into main
guoyuhao2330 May 22, 2024
825b6ce
Merge branch 'infiniflow:main' into main
guoyuhao2330 May 22, 2024
545cc0b
Merge branch 'infiniflow:main' into main
guoyuhao2330 May 23, 2024
0ce1a39
Merge branch 'infiniflow:main' into main
guoyuhao2330 May 27, 2024
8ffe3ad
Merge branch 'infiniflow:main' into main
guoyuhao2330 May 29, 2024
7f5ce7e
Merge branch 'infiniflow:main' into main
guoyuhao2330 May 30, 2024
f4686fb
Merge branch 'infiniflow:main' into main
guoyuhao2330 May 31, 2024
1222da4
Add file discord_svr.py
guoyuhao2330 May 31, 2024
d102fe7
Delete rag/svr/discord_svr.py
guoyuhao2330 May 31, 2024
17b20e5
Merge branch 'infiniflow:main' into main
guoyuhao2330 Jun 7, 2024
7088707
Merge branch 'infiniflow:main' into main
guoyuhao2330 Jun 24, 2024
9feca35
Merge branch 'infiniflow:main' into main
guoyuhao2330 Jul 4, 2024
55b3316
Merge branch 'infiniflow:main' into main
guoyuhao2330 Jul 5, 2024
0e486fd
Update __init__.py
guoyuhao2330 Jul 5, 2024
246608e
Add files via upload
guoyuhao2330 Jul 5, 2024
efe326b
Update baidu.py
guoyuhao2330 Jul 5, 2024
936cc35
Update duckduckgosearch.py
guoyuhao2330 Jul 5, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions graph/component/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from .rewrite import RewriteQuestion, RewriteQuestionParam
from .keyword import KeywordExtract, KeywordExtractParam
from .baidu import Baidu, BaiduParam
from .duckduckgosearch import DuckDuckGoSearch, DuckDuckGoSearchParam


def component_class(class_name):
Expand Down
10 changes: 5 additions & 5 deletions graph/component/baidu.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,12 @@ def _run(self, history, **kwargs):
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'}
response = requests.get(url=url, headers=headers)

baidu_res = re.findall(r'"contentText":"(.*?)"', response.text)
url_res = re.findall(r"'url': \\\"(.*?)\\\"}", response.text)
for i in range(min(len(baidu_res), len(url_res))):
baidu_res[i] += '<a>' + url_res[i] + '</a>'

del url_res
title_res = re.findall(r"'title': \\\"(.*?)\\\",\\n", response.text)
body_res = re.findall(r"\"contentText\":\"(.*?)\"", response.text)
baidu_res = [re.sub('<em>|</em>', '', '<a href="' + url + '">' + title + '</a> ' + body) for url, title, body
in zip(url_res, title_res, body_res)]
del body_res, url_res, title_res

br = pd.DataFrame(baidu_res, columns=['content'])
print(">>>>>>>>>>>>>>>>>>>>>>>>>>\n", br)
Expand Down
62 changes: 62 additions & 0 deletions graph/component/duckduckgosearch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import random
from abc import ABC
from functools import partial
from duckduckgosearch import DDGS
import pandas as pd

from graph.component.base import ComponentBase, ComponentParamBase


class DuckDuckGoSearchParam(ComponentParamBase):
    """
    Configuration for the DuckDuckGoSearch component.

    Attributes are set in ``__init__``:
      * top_n   -- maximum number of results to request (defaults to 10).
      * channel -- which search to run, "text" or "news" (defaults to "text").
    """

    def __init__(self):
        super().__init__()
        # Defaults: ten hits from the plain web-search channel.
        self.top_n = 10
        self.channel = "text"

    def check(self):
        """Validate the parameters using the base-class check helpers."""
        allowed_channels = ["text", "news"]
        self.check_positive_integer(self.top_n, "Top N")
        self.check_valid_value(
            self.channel,
            "Web Search or News",
            allowed_channels,
        )


class DuckDuckGoSearch(ComponentBase, ABC):
    """
    Component that runs a DuckDuckGo search built from its upstream input.

    The "content" values of the component input are joined into a single
    query string, sent to the channel selected in the parameters ("text" or
    "news"), and every hit is rendered as ``<a href="URL">TITLE</a> BODY``
    in a one-column ("content") pandas DataFrame.
    """
    component_name = "DuckDuckGoSearch"

    def _run(self, history, **kwargs):
        """
        Execute the search and return the formatted hits.

        Args:
            history: Not used by this component.
            **kwargs: Not used by this component.

        Returns:
            A pandas DataFrame with a single "content" column holding one
            row per search hit, or the result of ``be_output`` when the
            query built from the input is empty.

        Raises:
            ValueError: If the configured channel is neither "text" nor
                "news".
        """
        ans = self.get_input()
        ans = " - ".join(ans["content"]) if "content" in ans else ""
        if not ans:
            # The original called Baidu.be_output, but Baidu is never
            # imported in this module; go through this class instead.
            # DuckDuckGoSearchParam does not define `no` itself, so fall
            # back to empty content when the attribute is absent.
            return DuckDuckGoSearch.be_output(getattr(self._param, "no", ""))

        # The channel lives on the parameter object, not on the component.
        channel = self._param.channel
        top_n = self._param.top_n

        with DDGS() as ddgs:
            if channel == "text":
                # Hit shape: {'title': '', 'href': '', 'body': ''}
                hits = ddgs.text(ans, max_results=top_n)
                link_key = "href"
            elif channel == "news":
                # Hit shape: {'date': '', 'title': '', 'body': '', 'url': '',
                #             'image': '', 'source': ''}
                hits = ddgs.news(ans, max_results=top_n)
                link_key = "url"
            else:
                # Fail with a clear message instead of leaving the result
                # list unbound further down.
                raise ValueError(
                    "Unsupported DuckDuckGo channel: {!r}".format(channel))

            # Build the rows while the DDGS session is still open, in case
            # the client yields results lazily.
            duck_res = ['<a href="' + i[link_key] + '">' + i["title"] + '</a> ' + i["body"]
                        for i in hits]

        dr = pd.DataFrame(duck_res, columns=['content'])
        print(">>>>>>>>>>>>>>>>>>>>>>>>>>\n", dr)
        return dr