
Commit 4c0668b

Merge branch 'master' of /~https://github.com/deepset-ai/haystack into private_key_embedding_model
MichelBartels committed Mar 30, 2022
2 parents ac4b291 + eb514a6 commit 4c0668b
Showing 812 changed files with 38,364 additions and 32,667 deletions.
6 changes: 4 additions & 2 deletions .github/release.yml
@@ -1,7 +1,7 @@
changelog:
  exclude:
    labels:
-      - ignore-for-release
+      - ignore-for-release-notes
  categories:
    - title: Breaking Changes
      labels:
@@ -17,14 +17,16 @@ changelog:
        - topic:document_store
        - topic:elasticsearch
        - topic:faiss
        - topic:milvus
        - topic:weaviate
        - topic:pinecone
        - topic:sql
    - title: REST API
      labels:
        - topic:api
    - title: UI / Demo
      labels:
        - topic:ui
        - topic:demo
    - title: Documentation
      labels:
        - type:documentation
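
For context, release.yml drives GitHub's automatically generated release notes: merged pull requests are grouped into the categories above according to their labels, and PRs labeled ignore-for-release-notes are left out entirely. A minimal sketch of how a complete category entry reads, where the breaking change label name is an assumption since the full hunk is truncated here:

changelog:
  categories:
    - title: Breaking Changes
      labels:
        - breaking change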
316 changes: 7 additions & 309 deletions .github/utils/generate_json_schema.py
@@ -1,315 +1,13 @@
import json
import sys
import logging
import subprocess
from pathlib import Path
from typing import Any, Dict, Optional, Set, Tuple

from haystack import __version__
import haystack.document_stores
import haystack.nodes
import pydantic.schema
from fastapi.dependencies.utils import get_typed_signature
from pydantic import BaseConfig, BaseSettings, Required, SecretStr, create_model
from pydantic.fields import ModelField
from pydantic.schema import SkipField, TypeModelOrEnum, TypeModelSet, encode_default
from pydantic.schema import field_singleton_schema as _field_singleton_schema
from pydantic.typing import is_callable_type
from pydantic.utils import lenient_issubclass
logging.basicConfig(level=logging.INFO)

schema_version = __version__
filename = f"haystack-pipeline-{schema_version}.schema.json"
destination_path = Path(__file__).parent.parent.parent / "json-schemas" / filename

sys.path.append(".")
from haystack.nodes._json_schema import update_json_schema

class Settings(BaseSettings):
    input_token: SecretStr
    github_repository: str


# Monkey patch Pydantic's field_singleton_schema to convert classes and functions to
# strings in JSON Schema
def field_singleton_schema(
    field: ModelField,
    *,
    by_alias: bool,
    model_name_map: Dict[TypeModelOrEnum, str],
    ref_template: str,
    schema_overrides: bool = False,
    ref_prefix: Optional[str] = None,
    known_models: TypeModelSet,
) -> Tuple[Dict[str, Any], Dict[str, Any], Set[str]]:
    try:
        return _field_singleton_schema(
            field,
            by_alias=by_alias,
            model_name_map=model_name_map,
            ref_template=ref_template,
            schema_overrides=schema_overrides,
            ref_prefix=ref_prefix,
            known_models=known_models,
        )
    except (ValueError, SkipField):
        schema: Dict[str, Any] = {"type": "string"}

        if isinstance(field.default, type) or is_callable_type(field.default):
            default = field.default.__name__
        else:
            default = field.default
        if not field.required:
            schema["default"] = encode_default(default)
        return schema, {}, set()


# Monkeypatch Pydantic's field_singleton_schema
pydantic.schema.field_singleton_schema = field_singleton_schema
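
# Note on the fallback above: many Haystack node parameters default to classes or
# functions, which pydantic cannot encode into JSON Schema on its own. The patched
# field_singleton_schema catches those cases, represents the field as a plain
# string, and uses the callable's __name__ as the default value.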


class Config(BaseConfig):
    extra = "forbid"


def get_json_schema():
    """
    Generate JSON schema for Haystack pipelines.
    """
    schema_definitions = {}
    additional_definitions = {}

    modules_with_nodes = [haystack.nodes, haystack.document_stores]
    possible_nodes = []
    for module in modules_with_nodes:
        for importable_name in dir(module):
            imported = getattr(module, importable_name)
            possible_nodes.append((module, imported))
    # TODO: decide if there's a better way to not include Base classes other than by
    # the prefix "Base" in the name. Maybe it could make sense to have a list of
    # all the valid nodes to include in the main source code and then using that here.
    for module, node in possible_nodes:
        if lenient_issubclass(node, haystack.nodes.BaseComponent) and not node.__name__.startswith("Base"):
            logging.info(f"Processing node: {node.__name__}")
            init_method = getattr(node, "__init__", None)
            if init_method:
                signature = get_typed_signature(init_method)
                param_fields = [
                    param
                    for param in signature.parameters.values()
                    if param.kind not in {param.VAR_POSITIONAL, param.VAR_KEYWORD}
                ]
                # Remove self parameter
                param_fields.pop(0)
                param_fields_kwargs: Dict[str, Any] = {}
                for param in param_fields:
                    logging.info(f"--- processing param: {param.name}")
                    annotation = Any
                    if param.annotation != param.empty:
                        annotation = param.annotation
                    default = Required
                    if param.default != param.empty:
                        default = param.default
                    param_fields_kwargs[param.name] = (annotation, default)
                model = create_model(
                    f"{node.__name__}ComponentParams",
                    __config__=Config,
                    **param_fields_kwargs,
                )
                model.update_forward_refs(**model.__dict__)
                params_schema = model.schema()
                params_schema["title"] = "Parameters"
                params_schema[
                    "description"
                ] = "Each parameter can reference other components defined in the same YAML file."
                if "definitions" in params_schema:
                    params_definitions = params_schema.pop("definitions")
                    additional_definitions.update(params_definitions)
                component_schema = {
                    "type": "object",
                    "properties": {
                        "name": {
                            "title": "Name",
                            "description": "Custom name for the component. Helpful for visualization and debugging.",
                            "type": "string",
                        },
                        "type": {
                            "title": "Type",
                            "description": "Haystack Class name for the component.",
                            "type": "string",
                            "const": f"{node.__name__}",
                        },
                        "params": params_schema,
                    },
                    "required": ["type", "name"],
                    "additionalProperties": False,
                }
                schema_definitions[f"{node.__name__}Component"] = component_schema

    all_definitions = {**schema_definitions, **additional_definitions}
    component_refs = [{"$ref": f"#/definitions/{name}"} for name in schema_definitions]
    pipeline_schema = {
        "$schema": "http://json-schema.org/draft-07/schema",
        "$id": f"https://haystack.deepset.ai/json-schemas/{filename}",
        "title": "Haystack Pipeline",
        "description": "Haystack Pipeline YAML file describing the nodes of the pipelines. For more info read the docs at: https://haystack.deepset.ai/components/pipelines#yaml-file-definitions",
        "type": "object",
        "properties": {
            "version": {
                "title": "Version",
                "description": "Version of the Haystack Pipeline file.",
                "type": "string",
                "const": schema_version,
            },
            "components": {
                "title": "Components",
                "description": "Component nodes and their configurations, to later be used in the pipelines section. Define here all the building blocks for the pipelines.",
                "type": "array",
                "items": {"anyOf": component_refs},
                "required": ["type", "name"],
                "additionalProperties": False,
            },
            "pipelines": {
                "title": "Pipelines",
                "description": "Multiple pipelines can be defined using the components from the same YAML file.",
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "name": {
                            "title": "Name",
                            "description": "Name of the pipeline.",
                            "type": "string",
                        },
                        "nodes": {
                            "title": "Nodes",
                            "description": "Nodes to be used by this particular pipeline",
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "name": {
                                        "title": "Name",
                                        "description": "The name of this particular node in the pipeline. This should be one of the names from the components defined in the same file.",
                                        "type": "string",
                                    },
                                    "inputs": {
                                        "title": "Inputs",
                                        "description": "Input parameters for this node.",
                                        "type": "array",
                                        "items": {"type": "string"},
                                    },
                                },
                                "additionalProperties": False,
                            },
                            "required": ["name", "nodes"],
                            "additionalProperties": False,
                        },
                    },
                    "additionalProperties": False,
                },
            },
        },
        "required": ["version", "components", "pipelines"],
        "additionalProperties": False,
        "definitions": all_definitions,
    }
    return pipeline_schema


def list_indexed_versions(index):
    """
    Given the schema index as a parsed JSON,
    return a list of all the versions it contains.
    """
    indexed_versions = []
    for version_entry in index["oneOf"]:
        for property_entry in version_entry["allOf"]:
            if "properties" in property_entry.keys():
                indexed_versions.append(property_entry["properties"]["version"]["const"])
    return indexed_versions


def cleanup_rc_versions(index):
    """
    Given the schema index as a parsed JSON,
    removes any existing (unstable) rc version from it.
    """
    new_versions_list = []
    for version_entry in index["oneOf"]:
        for property_entry in version_entry["allOf"]:
            if "properties" in property_entry.keys():
                if "rc" not in property_entry["properties"]["version"]["const"]:
                    new_versions_list.append(version_entry)
                break
    index["oneOf"] = new_versions_list
    return index


def new_version_entry(version):
    """
    Returns a new entry for the version index JSON schema.
    """
    return {
        "allOf": [
            {"properties": {"version": {"const": version}}},
            {
                "$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/json-schemas/"
                f"haystack-pipeline-{version}.schema.json"
            },
        ]
    }


def generate_json_schema():
    # Create new schema file
    pipeline_schema = get_json_schema()
    destination_path.parent.mkdir(parents=True, exist_ok=True)
    destination_path.write_text(json.dumps(pipeline_schema, indent=2))

    # Update schema index
    index = []
    index_path = Path(__file__).parent.parent.parent / "json-schemas" / "haystack-pipeline.schema.json"
    with open(index_path, "r") as index_file:
        index = json.load(index_file)
    if index:
        index = cleanup_rc_versions(index)
        indexed_versions = list_indexed_versions(index)
        if not any(version == schema_version for version in indexed_versions):
            index["oneOf"].append(new_version_entry(schema_version))
            with open(index_path, "w") as index_file:
                json.dump(index, index_file, indent=4)


def main():
    from github import Github

    generate_json_schema()
    logging.basicConfig(level=logging.INFO)
    settings = Settings()
    logging.info(f"Using config: {settings.json()}")
    g = Github(settings.input_token.get_secret_value())
    repo = g.get_repo(settings.github_repository)

    logging.info("Setting up GitHub Actions git user")
    subprocess.run(["git", "config", "user.name", "github-actions"], check=True)
    subprocess.run(["git", "config", "user.email", "github-actions@github.com"], check=True)
    branch_name = "generate-json-schema"
    logging.info(f"Creating a new branch {branch_name}")
    subprocess.run(["git", "checkout", "-b", branch_name], check=True)
    logging.info("Adding updated file")
    subprocess.run(["git", "add", str(destination_path)], check=True)
    logging.info("Committing updated file")
    message = "⬆ Upgrade JSON Schema file"
    subprocess.run(["git", "commit", "-m", message], check=True)
    logging.info("Pushing branch")
    subprocess.run(["git", "push", "origin", branch_name], check=True)
    logging.info("Creating PR")
    pr = repo.create_pull(title=message, body=message, base="master", head=branch_name)
    logging.info(f"Created PR: {pr.number}")
    logging.info("Finished")


if __name__ == "__main__":
    # If you only want to generate the JSON Schema file without submitting a PR
    # uncomment this line:
    generate_json_schema()

    # and comment this line:
    # main()

update_json_schema(
    update_index=True, destination_path=Path(__file__).parent.parent.parent / "haystack" / "json-schemas"
)
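
With this change the heavy lifting moves into haystack.nodes._json_schema.update_json_schema, which, judging by the update_index flag, writes the versioned schema file and refreshes the haystack-pipeline.schema.json index. As a minimal sketch of what the generated schema is for, assuming the PyYAML and jsonschema packages are installed and with purely illustrative component names, version string, and paths:

import json
from pathlib import Path

import yaml        # assumption: PyYAML is available
import jsonschema  # assumption: the jsonschema package is available

# Illustrative pipeline definition; the component, pipeline, and version are made up.
pipeline_yaml = """
version: '1.3.0'
components:
  - name: MyReader
    type: FARMReader
    params:
      model_name_or_path: deepset/roberta-base-squad2
pipelines:
  - name: my_query_pipeline
    nodes:
      - name: MyReader
        inputs: [Query]
"""

# The path is an assumption based on the destination_path used above.
schema_file = Path("haystack/json-schemas") / "haystack-pipeline-1.3.0.schema.json"
schema = json.loads(schema_file.read_text())

# Raises jsonschema.exceptions.ValidationError if the definition does not match.
jsonschema.validate(instance=yaml.safe_load(pipeline_yaml), schema=schema)
print("Pipeline definition matches the schema.")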
32 changes: 32 additions & 0 deletions .github/utils/generate_openapi_specs.py
@@ -0,0 +1,32 @@
import json
from pathlib import Path
import os
import sys
import shutil

REST_PATH = Path("./rest_api").absolute()
PIPELINE_PATH = str(REST_PATH / "pipeline" / "pipeline_empty.haystack-pipeline.yml")
APP_PATH = str(REST_PATH / "application.py")
DOCS_PATH = Path("./docs") / "_src" / "api" / "openapi"

os.environ["PIPELINE_YAML_PATH"] = PIPELINE_PATH

print(f"Loading OpenAPI specs from {APP_PATH} with pipeline at {PIPELINE_PATH}")

sys.path.append(".")
from rest_api.application import get_openapi_specs, haystack_version

# Generate the openapi specs
specs = get_openapi_specs()

# Dump the specs into a JSON file
with open(DOCS_PATH / "openapi.json", "w") as f:
    json.dump(specs, f, indent=4)

# Remove any rc versions of the specs from the docs folder
for specs_file in os.listdir(DOCS_PATH):
    specs_path = DOCS_PATH / specs_file
    if os.path.isfile(specs_path) and "rc" in specs_file and specs_path.suffix == ".json":
        os.remove(specs_path)

# Add versioned copy
shutil.copy(DOCS_PATH / "openapi.json", DOCS_PATH / f"openapi-{haystack_version}.json")
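
A quick sanity check of the output, assuming the script was just run from the repository root (the path mirrors DOCS_PATH above; info and paths are standard OpenAPI fields):

import json
from pathlib import Path

# Load the spec the script just wrote and list the exposed REST endpoints.
spec = json.loads((Path("./docs") / "_src" / "api" / "openapi" / "openapi.json").read_text())
print(spec["info"]["title"], spec["info"]["version"])
for route in sorted(spec["paths"]):
    print(route)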