From 03695e5803ee4d685bef647df02a8b64ceaacdf3 Mon Sep 17 00:00:00 2001 From: Thomas Stadelmann Date: Sat, 19 Feb 2022 17:26:02 +0100 Subject: [PATCH 1/3] add docker-compose override file for traffic monitoring --- docker-compose.mitm.yml | 34 +++++++++++++++++++++++ haystack/document_stores/elasticsearch.py | 25 ++++++++++++++--- 2 files changed, 55 insertions(+), 4 deletions(-) create mode 100644 docker-compose.mitm.yml diff --git a/docker-compose.mitm.yml b/docker-compose.mitm.yml new file mode 100644 index 0000000000..2422d55e8e --- /dev/null +++ b/docker-compose.mitm.yml @@ -0,0 +1,34 @@ +# docker-compose override file to enable HTTP traffic monitoring between ui, haystack-api and elasticsearch using mitmproxy. +# After startup, open the mitmweb UI at http://localhost:8081 in your browser. +# Usage: docker-compose -f docker-compose[-gpu].yml -f docker-compose.mitm.yml up +version: "3" +services: + haystack-api: + environment: + - HTTP_PROXY=http://mitmproxy:8080 + - HTTPS_PROXY=https://mitmproxy:8080 + - REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt + - DOCUMENTSTORE_PARAMS_USE_SYSTEM_PROXY=true + command: "/bin/bash -c 'sleep 10 + && wget -e http_proxy=mitmproxy:8080 -O /usr/local/share/ca-certificates/mitmproxy.crt http://mitm.it/cert/pem + && update-ca-certificates + && gunicorn rest_api.application:app -b 0.0.0.0 -k uvicorn.workers.UvicornWorker --workers 2 --timeout 180'" + depends_on: + - mitmproxy + ui: + environment: + - HTTP_PROXY=http://mitmproxy:8080 + - HTTPS_PROXY=https://mitmproxy:8080 + - REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt + command: "/bin/bash -c 'sleep 15 + && wget -e http_proxy=mitmproxy:8080 -O /usr/local/share/ca-certificates/mitmproxy.crt http://mitm.it/cert/pem + && update-ca-certificates + && python -m streamlit run ui/webapp.py'" + depends_on: + - mitmproxy + mitmproxy: + image: "mitmproxy/mitmproxy:latest" + ports: + - 8080:8080 + - 8081:8081 + command: "mitmweb --web-host 0.0.0.0 --set block_global=false" diff 
--git a/haystack/document_stores/elasticsearch.py b/haystack/document_stores/elasticsearch.py index 67a45e7227..4b56e01df4 100644 --- a/haystack/document_stores/elasticsearch.py +++ b/haystack/document_stores/elasticsearch.py @@ -1,19 +1,20 @@ from modulefinder import Module -from typing import List, Optional, Union, Dict, Any, Generator +from typing import List, Optional, Type, Union, Dict, Any, Generator import json import logging +import os import time from copy import deepcopy from string import Template -from collections import defaultdict +import elasticsearch import numpy as np from scipy.special import expit from tqdm.auto import tqdm try: - from elasticsearch import Elasticsearch, RequestsHttpConnection + from elasticsearch import Elasticsearch, RequestsHttpConnection, Connection, Urllib3HttpConnection from elasticsearch.helpers import bulk, scan from elasticsearch.exceptions import RequestError except (ImportError, ModuleNotFoundError) as ie: @@ -65,6 +66,7 @@ def __init__( skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, synonym_type: str = "synonym", + use_system_proxy: bool = False ): """ A DocumentStore using Elasticsearch to store and query the documents for our search. @@ -137,6 +139,7 @@ def __init__( :param synonym_type: Synonym filter type can be passed. Synonym or Synonym_graph to handle synonyms, including multi-word synonyms correctly during the analysis process. More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-graph-tokenfilter.html + :param use_system_proxy: Whether to route requests to Elasticsearch through the system proxy (e.g. the one configured via the HTTP_PROXY / HTTPS_PROXY environment variables). If True, the client is created with `RequestsHttpConnection` instead of the default `Urllib3HttpConnection`. 
""" # save init parameters to enable export of component config as YAML @@ -172,6 +175,7 @@ def __init__( skip_missing_embeddings=skip_missing_embeddings, synonyms=synonyms, synonym_type=synonym_type, + use_system_proxy=use_system_proxy ) self.client = self._init_elastic_client( @@ -186,6 +190,7 @@ def __init__( ca_certs=ca_certs, verify_certs=verify_certs, timeout=timeout, + use_system_proxy=use_system_proxy ) # configure mappings to ES fields that will be used for querying / displaying results @@ -251,6 +256,7 @@ def _init_elastic_client( ca_certs: Optional[str], verify_certs: bool, timeout: int, + use_system_proxy: bool ) -> Elasticsearch: hosts = self._prepare_hosts(host, port) @@ -258,6 +264,10 @@ def _init_elastic_client( if (api_key or api_key_id) and not (api_key and api_key_id): raise ValueError("You must provide either both or none of `api_key_id` and `api_key`") + connection_class: Type[Connection] = Urllib3HttpConnection + if use_system_proxy: + connection_class = RequestsHttpConnection + if api_key: # api key authentication client = Elasticsearch( @@ -267,6 +277,7 @@ def _init_elastic_client( ca_certs=ca_certs, verify_certs=verify_certs, timeout=timeout, + connection_class=connection_class, ) elif aws4auth: # aws elasticsearch with IAM @@ -288,11 +299,17 @@ def _init_elastic_client( ca_certs=ca_certs, verify_certs=verify_certs, timeout=timeout, + connection_class=connection_class, ) else: # there is no authentication for this elasticsearch instance client = Elasticsearch( - hosts=hosts, scheme=scheme, ca_certs=ca_certs, verify_certs=verify_certs, timeout=timeout + hosts=hosts, + scheme=scheme, + ca_certs=ca_certs, + verify_certs=verify_certs, + timeout=timeout, + connection_class=connection_class, ) # Test connection From 28f8578e17b9812fbd8fb12bb41078a346357966 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 21 Feb 2022 12:50:19 +0000 Subject: [PATCH 2/3] Update Documentation & 
Code Style --- haystack/document_stores/elasticsearch.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/haystack/document_stores/elasticsearch.py b/haystack/document_stores/elasticsearch.py index 4b56e01df4..beedee4c3d 100644 --- a/haystack/document_stores/elasticsearch.py +++ b/haystack/document_stores/elasticsearch.py @@ -66,7 +66,7 @@ def __init__( skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, synonym_type: str = "synonym", - use_system_proxy: bool = False + use_system_proxy: bool = False, ): """ A DocumentStore using Elasticsearch to store and query the documents for our search. @@ -175,7 +175,7 @@ def __init__( skip_missing_embeddings=skip_missing_embeddings, synonyms=synonyms, synonym_type=synonym_type, - use_system_proxy=use_system_proxy + use_system_proxy=use_system_proxy, ) self.client = self._init_elastic_client( @@ -190,7 +190,7 @@ def __init__( ca_certs=ca_certs, verify_certs=verify_certs, timeout=timeout, - use_system_proxy=use_system_proxy + use_system_proxy=use_system_proxy, ) # configure mappings to ES fields that will be used for querying / displaying results @@ -256,7 +256,7 @@ def _init_elastic_client( ca_certs: Optional[str], verify_certs: bool, timeout: int, - use_system_proxy: bool + use_system_proxy: bool, ) -> Elasticsearch: hosts = self._prepare_hosts(host, port) From b620bc5de0ac4b8cd698c87048c720437642ad6a Mon Sep 17 00:00:00 2001 From: Thomas Stadelmann Date: Mon, 21 Feb 2022 13:59:46 +0100 Subject: [PATCH 3/3] remove faulty imports --- haystack/document_stores/elasticsearch.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/haystack/document_stores/elasticsearch.py b/haystack/document_stores/elasticsearch.py index 4b56e01df4..c7451ada3f 100644 --- a/haystack/document_stores/elasticsearch.py +++ b/haystack/document_stores/elasticsearch.py @@ -3,11 +3,9 @@ import json import logging -import os import time from copy import deepcopy from string import Template -import elasticsearch 
import numpy as np from scipy.special import expit