Commit: Merge branch 'master' of /~https://github.com/deepset-ai/haystack into private_key_embedding_model
Showing 812 changed files with 38,364 additions and 32,667 deletions.
@@ -1,315 +1,13 @@
import json
import sys
import logging
import subprocess
from pathlib import Path
from typing import Any, Dict, Optional, Set, Tuple

from haystack import __version__
import haystack.document_stores
import haystack.nodes
import pydantic.schema
from fastapi.dependencies.utils import get_typed_signature
from pydantic import BaseConfig, BaseSettings, Required, SecretStr, create_model
from pydantic.fields import ModelField
from pydantic.schema import SkipField, TypeModelOrEnum, TypeModelSet, encode_default
from pydantic.schema import field_singleton_schema as _field_singleton_schema
from pydantic.typing import is_callable_type
from pydantic.utils import lenient_issubclass
logging.basicConfig(level=logging.INFO)

schema_version = __version__
filename = f"haystack-pipeline-{schema_version}.schema.json"
destination_path = Path(__file__).parent.parent.parent / "json-schemas" / filename

sys.path.append(".")
from haystack.nodes._json_schema import update_json_schema

class Settings(BaseSettings):
    input_token: SecretStr
    github_repository: str


# Monkey patch Pydantic's field_singleton_schema to convert classes and functions to
# strings in JSON Schema
def field_singleton_schema(
    field: ModelField,
    *,
    by_alias: bool,
    model_name_map: Dict[TypeModelOrEnum, str],
    ref_template: str,
    schema_overrides: bool = False,
    ref_prefix: Optional[str] = None,
    known_models: TypeModelSet,
) -> Tuple[Dict[str, Any], Dict[str, Any], Set[str]]:
    try:
        return _field_singleton_schema(
            field,
            by_alias=by_alias,
            model_name_map=model_name_map,
            ref_template=ref_template,
            schema_overrides=schema_overrides,
            ref_prefix=ref_prefix,
            known_models=known_models,
        )
    except (ValueError, SkipField):
        schema: Dict[str, Any] = {"type": "string"}

        if isinstance(field.default, type) or is_callable_type(field.default):
            default = field.default.__name__
        else:
            default = field.default
        if not field.required:
            schema["default"] = encode_default(default)
        return schema, {}, set()


# Monkeypatch Pydantic's field_singleton_schema
pydantic.schema.field_singleton_schema = field_singleton_schema
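# With the patch applied, a parameter whose default is a class or a function
# (which Pydantic cannot normally serialize to JSON Schema) is rendered as
# {"type": "string"} with its __name__ as the encoded default, instead of
# raising during schema generation.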


class Config(BaseConfig):
    extra = "forbid"


def get_json_schema():
    """
    Generate JSON schema for Haystack pipelines.
    """
    schema_definitions = {}
    additional_definitions = {}

    modules_with_nodes = [haystack.nodes, haystack.document_stores]
    possible_nodes = []
    for module in modules_with_nodes:
        for importable_name in dir(module):
            imported = getattr(module, importable_name)
            possible_nodes.append((module, imported))
    # TODO: decide if there's a better way to not include Base classes other than by
    # the prefix "Base" in the name. Maybe it could make sense to have a list of
    # all the valid nodes to include in the main source code and then using that here.
    for module, node in possible_nodes:
        if lenient_issubclass(node, haystack.nodes.BaseComponent) and not node.__name__.startswith("Base"):
            logging.info(f"Processing node: {node.__name__}")
            init_method = getattr(node, "__init__", None)
            if init_method:
                signature = get_typed_signature(init_method)
                param_fields = [
                    param
                    for param in signature.parameters.values()
                    if param.kind not in {param.VAR_POSITIONAL, param.VAR_KEYWORD}
                ]
                # Remove self parameter
                param_fields.pop(0)
                param_fields_kwargs: Dict[str, Any] = {}
                for param in param_fields:
                    logging.info(f"--- processing param: {param.name}")
                    annotation = Any
                    if param.annotation != param.empty:
                        annotation = param.annotation
                    default = Required
                    if param.default != param.empty:
                        default = param.default
                    param_fields_kwargs[param.name] = (annotation, default)
                model = create_model(
                    f"{node.__name__}ComponentParams",
                    __config__=Config,
                    **param_fields_kwargs,
                )
                model.update_forward_refs(**model.__dict__)
                params_schema = model.schema()
                params_schema["title"] = "Parameters"
                params_schema[
                    "description"
                ] = "Each parameter can reference other components defined in the same YAML file."
                if "definitions" in params_schema:
                    params_definitions = params_schema.pop("definitions")
                    additional_definitions.update(params_definitions)
                component_schema = {
                    "type": "object",
                    "properties": {
                        "name": {
                            "title": "Name",
                            "description": "Custom name for the component. Helpful for visualization and debugging.",
                            "type": "string",
                        },
                        "type": {
                            "title": "Type",
                            "description": "Haystack Class name for the component.",
                            "type": "string",
                            "const": f"{node.__name__}",
                        },
                        "params": params_schema,
                    },
                    "required": ["type", "name"],
                    "additionalProperties": False,
                }
                schema_definitions[f"{node.__name__}Component"] = component_schema

    all_definitions = {**schema_definitions, **additional_definitions}
    component_refs = [{"$ref": f"#/definitions/{name}"} for name in schema_definitions]
    pipeline_schema = {
        "$schema": "http://json-schema.org/draft-07/schema",
        "$id": f"https://haystack.deepset.ai/json-schemas/{filename}",
        "title": "Haystack Pipeline",
        "description": "Haystack Pipeline YAML file describing the nodes of the pipelines. For more info read the docs at: https://haystack.deepset.ai/components/pipelines#yaml-file-definitions",
        "type": "object",
        "properties": {
            "version": {
                "title": "Version",
                "description": "Version of the Haystack Pipeline file.",
                "type": "string",
                "const": schema_version,
            },
            "components": {
                "title": "Components",
                "description": "Component nodes and their configurations, to later be used in the pipelines section. Define here all the building blocks for the pipelines.",
                "type": "array",
                "items": {"anyOf": component_refs},
                "required": ["type", "name"],
                "additionalProperties": False,
            },
            "pipelines": {
                "title": "Pipelines",
                "description": "Multiple pipelines can be defined using the components from the same YAML file.",
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "name": {
                            "title": "Name",
                            "description": "Name of the pipeline.",
                            "type": "string",
                        },
                        "nodes": {
                            "title": "Nodes",
                            "description": "Nodes to be used by this particular pipeline",
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "name": {
                                        "title": "Name",
                                        "description": "The name of this particular node in the pipeline. This should be one of the names from the components defined in the same file.",
                                        "type": "string",
                                    },
                                    "inputs": {
                                        "title": "Inputs",
                                        "description": "Input parameters for this node.",
                                        "type": "array",
                                        "items": {"type": "string"},
                                    },
                                },
                                "additionalProperties": False,
                            },
                            "required": ["name", "nodes"],
                            "additionalProperties": False,
                        },
                    },
                    "additionalProperties": False,
                },
            },
        },
        "required": ["version", "components", "pipelines"],
        "additionalProperties": False,
        "definitions": all_definitions,
    }
    return pipeline_schema
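# For illustration only (component and pipeline names here are hypothetical,
# not taken from this diff), a YAML file matching the schema above looks like:
#
#   version: "<schema_version>"
#   components:
#     - name: MyDocumentStore
#       type: ElasticsearchDocumentStore
#       params: {}
#   pipelines:
#     - name: my_query_pipeline
#       nodes:
#         - name: MyDocumentStore
#           inputs: [Query]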


def list_indexed_versions(index):
    """
    Given the schema index as a parsed JSON,
    return a list of all the versions it contains.
    """
    indexed_versions = []
    for version_entry in index["oneOf"]:
        for property_entry in version_entry["allOf"]:
            if "properties" in property_entry.keys():
                indexed_versions.append(property_entry["properties"]["version"]["const"])
    return indexed_versions


def cleanup_rc_versions(index):
    """
    Given the schema index as a parsed JSON,
    removes any existing (unstable) rc version from it.
    """
    new_versions_list = []
    for version_entry in index["oneOf"]:
        for property_entry in version_entry["allOf"]:
            if "properties" in property_entry.keys():
                if "rc" not in property_entry["properties"]["version"]["const"]:
                    new_versions_list.append(version_entry)
                break
    index["oneOf"] = new_versions_list
    return index


def new_version_entry(version):
    """
    Returns a new entry for the version index JSON schema.
    """
    return {
        "allOf": [
            {"properties": {"version": {"const": version}}},
            {
                "$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/json-schemas/"
                f"haystack-pipeline-{version}.schema.json"
            },
        ]
    }


def generate_json_schema():
    # Create new schema file
    pipeline_schema = get_json_schema()
    destination_path.parent.mkdir(parents=True, exist_ok=True)
    destination_path.write_text(json.dumps(pipeline_schema, indent=2))

    # Update schema index
    index = []
    index_path = Path(__file__).parent.parent.parent / "json-schemas" / "haystack-pipeline.schema.json"
    with open(index_path, "r") as index_file:
        index = json.load(index_file)
    if index:
        index = cleanup_rc_versions(index)
        indexed_versions = list_indexed_versions(index)
        if not any(version == schema_version for version in indexed_versions):
            index["oneOf"].append(new_version_entry(schema_version))
            with open(index_path, "w") as index_file:
                json.dump(index, index_file, indent=4)


def main():
    from github import Github

    generate_json_schema()
    logging.basicConfig(level=logging.INFO)
    settings = Settings()
    logging.info(f"Using config: {settings.json()}")
    g = Github(settings.input_token.get_secret_value())
    repo = g.get_repo(settings.github_repository)

    logging.info("Setting up GitHub Actions git user")
    subprocess.run(["git", "config", "user.name", "github-actions"], check=True)
    subprocess.run(["git", "config", "user.email", "github-actions@github.com"], check=True)
    branch_name = "generate-json-schema"
    logging.info(f"Creating a new branch {branch_name}")
    subprocess.run(["git", "checkout", "-b", branch_name], check=True)
    logging.info("Adding updated file")
    subprocess.run(["git", "add", str(destination_path)], check=True)
    logging.info("Committing updated file")
    message = "⬆ Upgrade JSON Schema file"
    subprocess.run(["git", "commit", "-m", message], check=True)
    logging.info("Pushing branch")
    subprocess.run(["git", "push", "origin", branch_name], check=True)
    logging.info("Creating PR")
    pr = repo.create_pull(title=message, body=message, base="master", head=branch_name)
    logging.info(f"Created PR: {pr.number}")
    logging.info("Finished")


if __name__ == "__main__":
    # If you only want to generate the JSON Schema file without submitting a PR
    # uncomment this line:
    generate_json_schema()

    # and comment this line:
    # main()
    update_json_schema(
        update_index=True, destination_path=Path(__file__).parent.parent.parent / "haystack" / "json-schemas"
    )
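Net effect of this hunk: the schema-generation and PR-automation logic above is superseded by a single call to haystack.nodes._json_schema.update_json_schema, shrinking the script from 315 lines to 13. Piecing together only the retained lines of the diff, the replacement script reads roughly as follows (a reconstruction, not a verbatim copy):

import sys
import logging
from pathlib import Path

logging.basicConfig(level=logging.INFO)

sys.path.append(".")
from haystack.nodes._json_schema import update_json_schema

update_json_schema(
    update_index=True, destination_path=Path(__file__).parent.parent.parent / "haystack" / "json-schemas"
)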
@@ -0,0 +1,32 @@
import json
from pathlib import Path
import os
import sys
import shutil

REST_PATH = Path("./rest_api").absolute()
PIPELINE_PATH = str(REST_PATH / "pipeline" / "pipeline_empty.haystack-pipeline.yml")
APP_PATH = str(REST_PATH / "application.py")
DOCS_PATH = Path("./docs") / "_src" / "api" / "openapi"

os.environ["PIPELINE_YAML_PATH"] = PIPELINE_PATH

print(f"Loading OpenAPI specs from {APP_PATH} with pipeline at {PIPELINE_PATH}")

sys.path.append(".")
from rest_api.application import get_openapi_specs, haystack_version

# Generate the openapi specs
specs = get_openapi_specs()

# Dump the specs into a JSON file
with open(DOCS_PATH / "openapi.json", "w") as f:
    json.dump(specs, f, indent=4)
# Remove rc versions of the specs from the folder
for specs_file in os.listdir(DOCS_PATH):  # scan the specs folder, not the current working directory
    if (DOCS_PATH / specs_file).is_file() and "rc" in specs_file and Path(specs_file).suffix == ".json":
        os.remove(DOCS_PATH / specs_file)

# Add versioned copy
shutil.copy(DOCS_PATH / "openapi.json", DOCS_PATH / f"openapi-{haystack_version}.json")
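After a run, DOCS_PATH contains openapi.json plus the versioned copy. A minimal sanity check, assuming the script has already been run from the repository root (the path mirrors DOCS_PATH above; the "info" block is standard OpenAPI metadata):

import json
from pathlib import Path

# Same folder as DOCS_PATH in the script above
spec = json.loads((Path("./docs") / "_src" / "api" / "openapi" / "openapi.json").read_text())
print(spec["info"]["title"], spec["info"]["version"])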