diff --git a/config.py b/config.py index dae83e6..aa4c903 100644 --- a/config.py +++ b/config.py @@ -19,6 +19,7 @@ OPEN_SEARCH_URL = 'http://opensearch-node1:9200' OPEN_SEARCH_INDEX = 'scan-explorer' +OPEN_SEARCH_AGG_BUCKET_LIMIT = 10000 ADS_SEARCH_SERVICE_URL = 'https://api.adsabs.harvard.edu/v1/search/query' ADS_SEARCH_SERVICE_TOKEN = '' diff --git a/scan_explorer_service/open_search.py b/scan_explorer_service/open_search.py index 2fad6b3..039a76c 100644 --- a/scan_explorer_service/open_search.py +++ b/scan_explorer_service/open_search.py @@ -37,7 +37,7 @@ def append_aggregate(query: dict, agg_field: EsFields, page: int, size: int, sor } }, "ids": { - "terms": {"field": agg_field.value, "size": 10000}, + "terms": {"field": agg_field.value, "size": current_app.config.get("OPEN_SEARCH_AGG_BUCKET_LIMIT", 10000)}, "aggs": { "bucket_sort": { "bucket_sort": { @@ -97,7 +97,7 @@ def text_search_highlight(text: str, filter_field: EsFields, filter_value: str): for hit in es_search(query)['hits']['hits']: yield { "page_id": hit['_source']['page_id'], - "highlight": hit['highlight']['text'] + "highlight": hit.get('highlight', {}).get('text', []) } def set_page_ocr_fields(query: dict) -> dict: diff --git a/scan_explorer_service/tests/test_metadata.py b/scan_explorer_service/tests/test_metadata.py index 35b8afa..dc96c6b 100644 --- a/scan_explorer_service/tests/test_metadata.py +++ b/scan_explorer_service/tests/test_metadata.py @@ -356,5 +356,40 @@ def test_get_page_ocr_article_no_pages(self, OpenSearch): self.assertIn('no pages', data['message'].lower()) +class TestOpenSearchHighlight(TestCaseDatabase): + + def create_app(self): + from scan_explorer_service.app import create_app + return create_app(**{ + 'SQLALCHEMY_DATABASE_URI': self.postgresql_url, + 'OPEN_SEARCH_URL': 'http://localhost:1234', + 'OPEN_SEARCH_INDEX': 'test', + 'SQLALCHEMY_ECHO': False, + 'TESTING': True, + 'PROPAGATE_EXCEPTIONS': True, + 'TRAP_BAD_REQUEST_ERRORS': True, + 'PRESERVE_CONTEXT_ON_EXCEPTION': False + }) + + def setUp(self): + Base.metadata.drop_all(bind=self.app.db.engine) + Base.metadata.create_all(bind=self.app.db.engine) + + @patch('scan_explorer_service.open_search.es_search') + def test_text_search_highlight_missing_highlight_field(self, mock_es_search): + mock_es_search.return_value = { + 'hits': { + 'hits': [ + {'_source': {'page_id': 'page1'}}, + ] + } + } + from scan_explorer_service.open_search import text_search_highlight, EsFields + results = list(text_search_highlight('test query', EsFields.volume_id, 'vol1')) + self.assertEqual(len(results), 1) + self.assertEqual(results[0]['page_id'], 'page1') + self.assertEqual(results[0]['highlight'], []) + + if __name__ == '__main__': unittest.main() diff --git a/scan_explorer_service/tests/test_proxy.py b/scan_explorer_service/tests/test_proxy.py index f3accb5..27a9f03 100644 --- a/scan_explorer_service/tests/test_proxy.py +++ b/scan_explorer_service/tests/test_proxy.py @@ -88,6 +88,9 @@ def __init__(self, data, status_code, headers): def json(self): return self.json_data + def close(self): + pass + if 'notfound' in args[1]: return MockResponse({}, 401, {}) elif 'badrequest' in args[1]: @@ -111,6 +114,20 @@ def test_get_image(self, mock_request): response = image_proxy('badrequest-~image-~path') assert(response.status_code == 400) + @patch('scan_explorer_service.views.image_proxy.requests.request') + def test_image_proxy_closes_upstream_response(self, mock_request): + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.headers = {} + mock_response.raw.stream.return_value = [b'chunk1', b'chunk2'] + mock_request.return_value = mock_response + + url = url_for('proxy.image_proxy', path='some-~image-~path') + response = self.client.get(url) + self.assertEqual(response.status_code, 200) + response.close() + mock_response.close.assert_called() + @patch('requests.request', side_effect=mocked_request) def test_get_thumbnail(self, mock_request): @@ -233,6 +250,8 @@ def __init__(self, d, sc, h): self.raw = Raw(d) self.status_code = sc self.headers = h or {} + def close(self): + pass return MockResponse(data, status_code, headers or {}) @patch('requests.request') diff --git a/scan_explorer_service/views/image_proxy.py b/scan_explorer_service/views/image_proxy.py index 0b2d815..a3bbe0e 100644 --- a/scan_explorer_service/views/image_proxy.py +++ b/scan_explorer_service/views/image_proxy.py @@ -52,7 +52,10 @@ def image_proxy(path): def generate(): for chunk in r.raw.stream(decode_content=False): yield chunk - return Response(generate(), status=r.status_code, headers=headers) + + resp = Response(generate(), status=r.status_code, headers=headers) + resp.call_on_close(r.close) + return resp @advertise(scopes=['api'], rate_limit=[5000, 3600*24]) diff --git a/scan_explorer_service/views/metadata.py b/scan_explorer_service/views/metadata.py index 4ea1366..bfe9dc7 100644 --- a/scan_explorer_service/views/metadata.py +++ b/scan_explorer_service/views/metadata.py @@ -26,7 +26,7 @@ def article_extra(bibcode: str): try: params = {'q': f'bibcode:{bibcode}', 'fl':'title,author'} headers = {'Authorization': f'Bearer {auth_token}'} - response = requests.get(ads_search_service, params, headers=headers).json() + response = requests.get(ads_search_service, params, headers=headers, timeout=5).json() docs = response.get('response').get('docs') if docs: return docs[0] @@ -61,7 +61,7 @@ def put_article(): article = Article(**json) article_overwrite(session, article) return jsonify({'id': article.bibcode}), 200 - except: + except Exception: session.rollback() return jsonify(message='Failed to create article'), 500 else: @@ -130,7 +130,7 @@ def put_collection(): session.commit() return jsonify({'id': collection.id}), 200 - except: + except Exception: session.rollback() return jsonify(message='Failed to create collection'), 500 else: @@ -156,7 +156,7 @@ def put_page(): session.commit() session.refresh(page) return jsonify({'id': page.id}), 200 - except: + except Exception: session.rollback() return jsonify(message='Failed to create page'), 500 else: