
Commit 4c0668b

Merge branch 'master' of /~https://github.com/deepset-ai/haystack into private_key_embedding_model
MichelBartels committed Mar 30, 2022
2 parents ac4b291 + eb514a6 commit 4c0668b
Showing 812 changed files with 38,364 additions and 32,667 deletions.
6 changes: 4 additions & 2 deletions .github/release.yml
@@ -1,7 +1,7 @@
changelog:
  exclude:
    labels:
-      - ignore-for-release
+      - ignore-for-release-notes
  categories:
    - title: Breaking Changes
      labels:
@@ -17,14 +17,16 @@ changelog:
        - topic:document_store
        - topic:elasticsearch
        - topic:faiss
        - topic:milvus
        - topic:weaviate
        - topic:pinecone
        - topic:sql
    - title: REST API
      labels:
        - topic:api
    - title: UI / Demo
      labels:
        - topic:ui
        - topic:demo
    - title: Documentation
      labels:
        - type:documentation
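
For context, release.yml drives GitHub's automatically generated release notes: merged pull requests are grouped into the categories above according to their labels, and PRs labeled ignore-for-release-notes are left out entirely. A minimal sketch of how a complete category entry reads, where the breaking change label name is an assumption since the full hunk is truncated here:

changelog:
  categories:
    - title: Breaking Changes
      labels:
        - breaking change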
316 changes: 7 additions & 309 deletions .github/utils/generate_json_schema.py
@@ -1,315 +1,13 @@
import json
import sys
import logging
import subprocess
from pathlib import Path
from typing import Any, Dict, Optional, Set, Tuple

from haystack import __version__
import haystack.document_stores
import haystack.nodes
import pydantic.schema
from fastapi.dependencies.utils import get_typed_signature
from pydantic import BaseConfig, BaseSettings, Required, SecretStr, create_model
from pydantic.fields import ModelField
from pydantic.schema import SkipField, TypeModelOrEnum, TypeModelSet, encode_default
from pydantic.schema import field_singleton_schema as _field_singleton_schema
from pydantic.typing import is_callable_type
from pydantic.utils import lenient_issubclass
logging.basicConfig(level=logging.INFO)

schema_version = __version__
filename = f"haystack-pipeline-{schema_version}.schema.json"
destination_path = Path(__file__).parent.parent.parent / "json-schemas" / filename

sys.path.append(".")
from haystack.nodes._json_schema import update_json_schema

class Settings(BaseSettings):
    input_token: SecretStr
    github_repository: str


# Monkey patch Pydantic's field_singleton_schema to convert classes and functions to
# strings in JSON Schema
def field_singleton_schema(
    field: ModelField,
    *,
    by_alias: bool,
    model_name_map: Dict[TypeModelOrEnum, str],
    ref_template: str,
    schema_overrides: bool = False,
    ref_prefix: Optional[str] = None,
    known_models: TypeModelSet,
) -> Tuple[Dict[str, Any], Dict[str, Any], Set[str]]:
    try:
        return _field_singleton_schema(
            field,
            by_alias=by_alias,
            model_name_map=model_name_map,
            ref_template=ref_template,
            schema_overrides=schema_overrides,
            ref_prefix=ref_prefix,
            known_models=known_models,
        )
    except (ValueError, SkipField):
        schema: Dict[str, Any] = {"type": "string"}

        if isinstance(field.default, type) or is_callable_type(field.default):
            default = field.default.__name__
        else:
            default = field.default
        if not field.required:
            schema["default"] = encode_default(default)
        return schema, {}, set()


# Monkeypatch Pydantic's field_singleton_schema
pydantic.schema.field_singleton_schema = field_singleton_schema
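
# Note on the fallback above: many Haystack node parameters default to classes or
# functions, which pydantic cannot encode into JSON Schema on its own. The patched
# field_singleton_schema catches those cases, represents the field as a plain
# string, and uses the callable's __name__ as the default value.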


class Config(BaseConfig):
    extra = "forbid"


def get_json_schema():
    """
    Generate JSON schema for Haystack pipelines.
    """
    schema_definitions = {}
    additional_definitions = {}

    modules_with_nodes = [haystack.nodes, haystack.document_stores]
    possible_nodes = []
    for module in modules_with_nodes:
        for importable_name in dir(module):
            imported = getattr(module, importable_name)
            possible_nodes.append((module, imported))
    # TODO: decide if there's a better way to not include Base classes other than by
    # the prefix "Base" in the name. Maybe it could make sense to have a list of
    # all the valid nodes to include in the main source code and then using that here.
    for module, node in possible_nodes:
        if lenient_issubclass(node, haystack.nodes.BaseComponent) and not node.__name__.startswith("Base"):
            logging.info(f"Processing node: {node.__name__}")
            init_method = getattr(node, "__init__", None)
            if init_method:
                signature = get_typed_signature(init_method)
                param_fields = [
                    param
                    for param in signature.parameters.values()
                    if param.kind not in {param.VAR_POSITIONAL, param.VAR_KEYWORD}
                ]
                # Remove self parameter
                param_fields.pop(0)
                param_fields_kwargs: Dict[str, Any] = {}
                for param in param_fields:
                    logging.info(f"--- processing param: {param.name}")
                    annotation = Any
                    if param.annotation != param.empty:
                        annotation = param.annotation
                    default = Required
                    if param.default != param.empty:
                        default = param.default
                    param_fields_kwargs[param.name] = (annotation, default)
                model = create_model(
                    f"{node.__name__}ComponentParams",
                    __config__=Config,
                    **param_fields_kwargs,
                )
                model.update_forward_refs(**model.__dict__)
                params_schema = model.schema()
                params_schema["title"] = "Parameters"
                params_schema[
                    "description"
                ] = "Each parameter can reference other components defined in the same YAML file."
                if "definitions" in params_schema:
                    params_definitions = params_schema.pop("definitions")
                    additional_definitions.update(params_definitions)
                component_schema = {
                    "type": "object",
                    "properties": {
                        "name": {
                            "title": "Name",
                            "description": "Custom name for the component. Helpful for visualization and debugging.",
                            "type": "string",
                        },
                        "type": {
                            "title": "Type",
                            "description": "Haystack Class name for the component.",
                            "type": "string",
                            "const": f"{node.__name__}",
                        },
                        "params": params_schema,
                    },
                    "required": ["type", "name"],
                    "additionalProperties": False,
                }
                schema_definitions[f"{node.__name__}Component"] = component_schema

    all_definitions = {**schema_definitions, **additional_definitions}
    component_refs = [{"$ref": f"#/definitions/{name}"} for name in schema_definitions]
    pipeline_schema = {
        "$schema": "http://json-schema.org/draft-07/schema",
        "$id": f"https://haystack.deepset.ai/json-schemas/{filename}",
        "title": "Haystack Pipeline",
        "description": "Haystack Pipeline YAML file describing the nodes of the pipelines. For more info read the docs at: https://haystack.deepset.ai/components/pipelines#yaml-file-definitions",
        "type": "object",
        "properties": {
            "version": {
                "title": "Version",
                "description": "Version of the Haystack Pipeline file.",
                "type": "string",
                "const": schema_version,
            },
            "components": {
                "title": "Components",
                "description": "Component nodes and their configurations, to later be used in the pipelines section. Define here all the building blocks for the pipelines.",
                "type": "array",
                "items": {"anyOf": component_refs},
                "required": ["type", "name"],
                "additionalProperties": False,
            },
            "pipelines": {
                "title": "Pipelines",
                "description": "Multiple pipelines can be defined using the components from the same YAML file.",
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "name": {
                            "title": "Name",
                            "description": "Name of the pipeline.",
                            "type": "string",
                        },
                        "nodes": {
                            "title": "Nodes",
                            "description": "Nodes to be used by this particular pipeline",
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "name": {
                                        "title": "Name",
                                        "description": "The name of this particular node in the pipeline. This should be one of the names from the components defined in the same file.",
                                        "type": "string",
                                    },
                                    "inputs": {
                                        "title": "Inputs",
                                        "description": "Input parameters for this node.",
                                        "type": "array",
                                        "items": {"type": "string"},
                                    },
                                },
                                "additionalProperties": False,
                            },
                            "required": ["name", "nodes"],
                            "additionalProperties": False,
                        },
                    },
                    "additionalProperties": False,
                },
            },
        },
        "required": ["version", "components", "pipelines"],
        "additionalProperties": False,
        "definitions": all_definitions,
    }
    return pipeline_schema


def list_indexed_versions(index):
    """
    Given the schema index as a parsed JSON,
    return a list of all the versions it contains.
    """
    indexed_versions = []
    for version_entry in index["oneOf"]:
        for property_entry in version_entry["allOf"]:
            if "properties" in property_entry.keys():
                indexed_versions.append(property_entry["properties"]["version"]["const"])
    return indexed_versions


def cleanup_rc_versions(index):
    """
    Given the schema index as a parsed JSON,
    removes any existing (unstable) rc version from it.
    """
    new_versions_list = []
    for version_entry in index["oneOf"]:
        for property_entry in version_entry["allOf"]:
            if "properties" in property_entry.keys():
                if "rc" not in property_entry["properties"]["version"]["const"]:
                    new_versions_list.append(version_entry)
                break
    index["oneOf"] = new_versions_list
    return index


def new_version_entry(version):
    """
    Returns a new entry for the version index JSON schema.
    """
    return {
        "allOf": [
            {"properties": {"version": {"const": version}}},
            {
                "$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/json-schemas/"
                f"haystack-pipeline-{version}.schema.json"
            },
        ]
    }


def generate_json_schema():
    # Create new schema file
    pipeline_schema = get_json_schema()
    destination_path.parent.mkdir(parents=True, exist_ok=True)
    destination_path.write_text(json.dumps(pipeline_schema, indent=2))

    # Update schema index
    index = []
    index_path = Path(__file__).parent.parent.parent / "json-schemas" / "haystack-pipeline.schema.json"
    with open(index_path, "r") as index_file:
        index = json.load(index_file)
    if index:
        index = cleanup_rc_versions(index)
        indexed_versions = list_indexed_versions(index)
        if not any(version == schema_version for version in indexed_versions):
            index["oneOf"].append(new_version_entry(schema_version))
            with open(index_path, "w") as index_file:
                json.dump(index, index_file, indent=4)


def main():
    from github import Github

    generate_json_schema()
    logging.basicConfig(level=logging.INFO)
    settings = Settings()
    logging.info(f"Using config: {settings.json()}")
    g = Github(settings.input_token.get_secret_value())
    repo = g.get_repo(settings.github_repository)

    logging.info("Setting up GitHub Actions git user")
    subprocess.run(["git", "config", "user.name", "github-actions"], check=True)
    subprocess.run(["git", "config", "user.email", "github-actions@github.com"], check=True)
    branch_name = "generate-json-schema"
    logging.info(f"Creating a new branch {branch_name}")
    subprocess.run(["git", "checkout", "-b", branch_name], check=True)
    logging.info("Adding updated file")
    subprocess.run(["git", "add", str(destination_path)], check=True)
    logging.info("Committing updated file")
    message = "⬆ Upgrade JSON Schema file"
    subprocess.run(["git", "commit", "-m", message], check=True)
    logging.info("Pushing branch")
    subprocess.run(["git", "push", "origin", branch_name], check=True)
    logging.info("Creating PR")
    pr = repo.create_pull(title=message, body=message, base="master", head=branch_name)
    logging.info(f"Created PR: {pr.number}")
    logging.info("Finished")


if __name__ == "__main__":
    # If you only want to generate the JSON Schema file without submitting a PR
    # uncomment this line:
    generate_json_schema()

    # and comment this line:
    # main()

update_json_schema(
    update_index=True, destination_path=Path(__file__).parent.parent.parent / "haystack" / "json-schemas"
)
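
With this change the heavy lifting moves into haystack.nodes._json_schema.update_json_schema, which, judging by the update_index flag, writes the versioned schema file and refreshes the haystack-pipeline.schema.json index. As a minimal sketch of what the generated schema is for, assuming the PyYAML and jsonschema packages are installed and with purely illustrative component names, version string, and paths:

import json
from pathlib import Path

import yaml        # assumption: PyYAML is available
import jsonschema  # assumption: the jsonschema package is available

# Illustrative pipeline definition; the component, pipeline, and version are made up.
pipeline_yaml = """
version: '1.3.0'
components:
  - name: MyReader
    type: FARMReader
    params:
      model_name_or_path: deepset/roberta-base-squad2
pipelines:
  - name: my_query_pipeline
    nodes:
      - name: MyReader
        inputs: [Query]
"""

# The path is an assumption based on the destination_path used above.
schema_file = Path("haystack/json-schemas") / "haystack-pipeline-1.3.0.schema.json"
schema = json.loads(schema_file.read_text())

# Raises jsonschema.exceptions.ValidationError if the definition does not match.
jsonschema.validate(instance=yaml.safe_load(pipeline_yaml), schema=schema)
print("Pipeline definition matches the schema.")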
32 changes: 32 additions & 0 deletions .github/utils/generate_openapi_specs.py
@@ -0,0 +1,32 @@
import json
from pathlib import Path
import os
import sys
import shutil

REST_PATH = Path("./rest_api").absolute()
PIPELINE_PATH = str(REST_PATH / "pipeline" / "pipeline_empty.haystack-pipeline.yml")
APP_PATH = str(REST_PATH / "application.py")
DOCS_PATH = Path("./docs") / "_src" / "api" / "openapi"

os.environ["PIPELINE_YAML_PATH"] = PIPELINE_PATH

print(f"Loading OpenAPI specs from {APP_PATH} with pipeline at {PIPELINE_PATH}")

sys.path.append(".")
from rest_api.application import get_openapi_specs, haystack_version

# Generate the openapi specs
specs = get_openapi_specs()

# Dump the specs into a JSON file
with open(DOCS_PATH / "openapi.json", "w") as f:
    json.dump(specs, f, indent=4)

# Remove any rc versions of the specs from the docs folder
for specs_file in os.listdir(DOCS_PATH):
    specs_path = DOCS_PATH / specs_file
    if os.path.isfile(specs_path) and "rc" in specs_file and specs_path.suffix == ".json":
        os.remove(specs_path)

# Add versioned copy
shutil.copy(DOCS_PATH / "openapi.json", DOCS_PATH / f"openapi-{haystack_version}.json")
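
A quick sanity check of the output, assuming the script was just run from the repository root (the path mirrors DOCS_PATH above; info and paths are standard OpenAPI fields):

import json
from pathlib import Path

# Load the spec the script just wrote and list the exposed REST endpoints.
spec = json.loads((Path("./docs") / "_src" / "api" / "openapi" / "openapi.json").read_text())
print(spec["info"]["title"], spec["info"]["version"])
for route in sorted(spec["paths"]):
    print(route)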