From 96b01a8012d164df7c24c460149d3b79ecad3901 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Tue, 5 Jul 2022 17:08:52 +0200 Subject: [PATCH] Remove "bad characters" from our codebase (#24841) * Remove "bad characters" from our codebase We had plenty of "bad characters" in our codebase that were not invited and came here by accident. We want to get rid of those "bad characters" once and for all. --- .github/boring-cyborg.yml | 2 +- .pre-commit-config.yaml | 12 +- COMMITTERS.rst | 4 +- CONTRIBUTING.rst | 14 +- IMAGES.rst | 2 +- ISSUE_TRIAGE_PROCESS.rst | 20 +- RELEASE_NOTES.rst | 20 +- STATIC_CODE_CHECKS.rst | 4 +- airflow/cli/cli_parser.py | 4 +- .../apache/kylin/operators/kylin_cube.py | 12 +- .../databricks/operators/databricks.py | 2 +- airflow/providers/datadog/sensors/datadog.py | 4 +- .../providers/google/cloud/hooks/automl.py | 2 +- .../providers/google/cloud/hooks/pubsub.py | 2 +- .../google/cloud/operators/automl.py | 2 +- .../google/cloud/operators/pubsub.py | 2 +- .../google/cloud/operators/vision.py | 6 +- .../google/cloud/utils/field_validator.py | 4 +- .../providers/pagerduty/hooks/pagerduty.py | 2 +- .../pagerduty/hooks/pagerduty_events.py | 2 +- airflow/providers/qubole/operators/qubole.py | 4 +- airflow/providers/tableau/hooks/tableau.py | 4 +- airflow/settings.py | 4 +- airflow/www/forms.py | 2 +- chart/templates/cleanup/cleanup-cronjob.yaml | 2 +- dev/README_RELEASE_AIRFLOW.md | 8 +- dev/README_RELEASE_HELM_CHART.md | 2 +- dev/README_RELEASE_PROVIDER_PACKAGES.md | 2 +- .../src/airflow_breeze/pre_commit_ids.py | 2 +- docs/README.rst | 2 +- .../operators.rst | 2 +- .../connections/ftp.rst | 2 +- .../operators/cloud/datafusion.rst | 2 +- .../operators/cloud/mlengine.rst | 4 +- .../operators/cloud/pubsub.rst | 2 +- .../connections/sftp.rst | 4 +- .../connections/tableau.rst | 6 +- docs/apache-airflow/concepts/scheduler.rst | 6 +- .../apache-airflow/concepts/smart-sensors.rst | 14 +- docs/apache-airflow/dag-run.rst | 4 +- docs/apache-airflow/privacy_notice.rst | 2 +- docs/apache-airflow/production-deployment.rst | 2 +- docs/apache-airflow/release-process.rst | 4 +- .../security/secrets/fernet.rst | 2 +- docs/apache-airflow/timezone.rst | 22 +- docs/apache-airflow/usage-cli.rst | 2 +- docs/conf.py | 2 +- docs/docker-stack/build-arg-ref.rst | 2 +- .../docs_build/dev_index_template.html.jinja2 | 2 +- docs/helm-chart/quick-start.rst | 2 +- images/breeze/output-commands-hash.txt | 2 +- images/breeze/output-static-checks.svg | 220 +++++++++--------- .../pre_commit_replace_bad_characters.py | 73 ++++++ .../providers/papermill/input_notebook.ipynb | 2 +- 54 files changed, 308 insertions(+), 231 deletions(-) create mode 100755 scripts/ci/pre_commit/pre_commit_replace_bad_characters.py diff --git a/.github/boring-cyborg.yml b/.github/boring-cyborg.yml index 607d4fb6cdc55..ccd5259d5d190 100644 --- a/.github/boring-cyborg.yml +++ b/.github/boring-cyborg.yml @@ -216,7 +216,7 @@ firstPRWelcomeComment: > Consider adding an example DAG that shows how users should use it. - Consider using [Breeze environment](/~https://github.com/apache/airflow/blob/main/BREEZE.rst) for testing - locally, it’s a heavy docker but it ships with a working Airflow and a lot of integrations. + locally, it's a heavy docker but it ships with a working Airflow and a lot of integrations. - Be patient and persistent. It might take some time to get a review or get the final approval from Committers. 
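Editorial note on the new hook registered in the .pre-commit-config.yaml diff below: the diffstat above shows the patch creating scripts/ci/pre_commit/pre_commit_replace_bad_characters.py (73 lines, mode 100755), whose body is not included in this excerpt. The following is only a rough, hypothetical sketch of what a hook of that kind could look like; the replacement table and function names are illustrative, not the actual implementation.

```python
#!/usr/bin/env python
# Hypothetical sketch of a "replace bad characters" pre-commit hook.
# The real script lives at scripts/ci/pre_commit/pre_commit_replace_bad_characters.py;
# the character table below is illustrative only, not the actual list used by Airflow.
import sys
from pathlib import Path

REPLACEMENTS = {
    "\u2018": "'",   # left single quotation mark
    "\u2019": "'",   # right single quotation mark
    "\u201c": '"',   # left double quotation mark
    "\u201d": '"',   # right double quotation mark
    "\t": "    ",    # tabs -> spaces (covers the removed forbid-tabs hook)
}


def fix_file(path: Path) -> bool:
    """Rewrite the file in place; return True if anything changed."""
    text = path.read_text(encoding="utf-8")
    new_text = text
    for bad, good in REPLACEMENTS.items():
        new_text = new_text.replace(bad, good)
    if new_text != text:
        path.write_text(new_text, encoding="utf-8")
        return True
    return False


if __name__ == "__main__":
    # pre-commit passes the staged file names as positional arguments.
    changed = [name for name in sys.argv[1:] if fix_file(Path(name))]
    if changed:
        print("Replaced bad characters in:", ", ".join(changed))
        sys.exit(1)  # fail the hook so the rewritten files get re-staged
```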
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1548fe07e510e..ae4b39b4bd6c3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -42,9 +42,6 @@ repos: - repo: /~https://github.com/Lucas-C/pre-commit-hooks rev: v1.2.0 hooks: - - id: forbid-tabs - name: Fail if tabs are used in the project - exclude: ^airflow/_vendor/|^clients/gen/go\.sh$|^\.gitmodules$ - id: insert-license name: Add license for all SQL files files: \.sql$ @@ -303,6 +300,13 @@ repos: - --exclude-file=.codespellignorelines - repo: local hooks: + - id: replace-bad-characters + name: Replace bad characters + entry: ./scripts/ci/pre_commit/pre_commit_replace_bad_characters.py + language: python + types: [file, text] + exclude: ^airflow/_vendor/|^clients/gen/go\.sh$|^\.gitmodules$ + additional_dependencies: ['rich>=12.4.4'] - id: static-check-autoflake name: Remove all unused code entry: autoflake --remove-all-unused-imports --ignore-init-module-imports --in-place @@ -353,7 +357,7 @@ repos: name: Update output of breeze commands in BREEZE.rst entry: ./scripts/ci/pre_commit/pre_commit_breeze_cmd_line.py language: python - files: ^BREEZE\.rst$|^dev/breeze/.*$ + files: ^BREEZE\.rst$|^dev/breeze/.*$|^\.pre-commit-config\.yml$ pass_filenames: false additional_dependencies: ['rich>=12.4.4', 'rich-click>=1.5'] - id: update-local-yml-file diff --git a/COMMITTERS.rst b/COMMITTERS.rst index 054988407cb60..d90bd23e199f6 100644 --- a/COMMITTERS.rst +++ b/COMMITTERS.rst @@ -27,7 +27,7 @@ Before reading this document, you should be familiar with `Contributor's guide < Guidelines to become an Airflow Committer ------------------------------------------ -Committers are community members who have write access to the project’s +Committers are community members who have write access to the project's repositories, i.e., they can modify the code, documentation, and website by themselves and also accept other contributions. There is no strict protocol for becoming a committer. Candidates for new committers are typically people that are active contributors and community members. @@ -77,7 +77,7 @@ Community contributions 1. Was instrumental in triaging issues 2. Improved documentation of Airflow in significant way -3. Lead change and improvements introduction in the “community” processes and tools +3. Lead change and improvements introduction in the "community" processes and tools 4. Actively spreads the word about Airflow, for example organising Airflow summit, workshops for community members, giving and recording talks, writing blogs 5. Reporting bugs with detailed reproduction steps diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 283e430aed3ca..3093bd35e76d6 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -134,7 +134,7 @@ and guidelines. Committers/Maintainers ---------------------- -Committers are community members that have write access to the project’s repositories, i.e., they can modify the code, +Committers are community members that have write access to the project's repositories, i.e., they can modify the code, documentation, and website by themselves and also accept other contributions. The official list of committers can be found `here `__. 
@@ -277,7 +277,7 @@ For effective collaboration, make sure to join the following Airflow groups: - Mailing lists: - - Developer’s mailing list ``_ + - Developer's mailing list ``_ (quite substantial traffic on this list) - All commits mailing list: ``_ @@ -1472,14 +1472,14 @@ Here are a few rules that are important to keep in mind when you enter our commu * There is a #newbie-questions channel in slack as a safe place to ask questions * You can ask one of the committers to be a mentor for you, committers can guide within the community * You can apply to more structured `Apache Mentoring Programme `_ -* It’s your responsibility as an author to take your PR from start-to-end including leading communication +* It's your responsibility as an author to take your PR from start-to-end including leading communication in the PR -* It’s your responsibility as an author to ping committers to review your PR - be mildly annoying sometimes, - it’s OK to be slightly annoying with your change - it is also a sign for committers that you care +* It's your responsibility as an author to ping committers to review your PR - be mildly annoying sometimes, + it's OK to be slightly annoying with your change - it is also a sign for committers that you care * Be considerate to the high code quality/test coverage requirements for Apache Airflow * If in doubt - ask the community for their opinion or propose to vote at the devlist * Discussions should concern subject matters - judge or criticise the merit but never criticise people -* It’s OK to express your own emotions while communicating - it helps other people to understand you +* It's OK to express your own emotions while communicating - it helps other people to understand you * Be considerate for feelings of others. Tell about how you feel not what you think of others Commit Policy @@ -1495,6 +1495,6 @@ and slightly modified and consensus reached in October 2020: Resources & Links ================= -- `Airflow’s official documentation `__ +- `Airflow's official documentation `__ - `More resources and links to Airflow related content on the Wiki `__ diff --git a/IMAGES.rst b/IMAGES.rst index 634c4ac8a0e12..2f9fc7234b13e 100644 --- a/IMAGES.rst +++ b/IMAGES.rst @@ -382,7 +382,7 @@ The following build arguments (``--build-arg`` in docker build command) can be u | ``HOME`` | ``/root`` | Home directory of the root user (CI | | | | image has root user as default) | +------------------------------------------+------------------------------------------+------------------------------------------+ -| ``AIRFLOW_HOME`` | ``/root/airflow`` | Airflow’s HOME (that’s where logs and | +| ``AIRFLOW_HOME`` | ``/root/airflow`` | Airflow's HOME (that's where logs and | | | | sqlite databases are stored) | +------------------------------------------+------------------------------------------+------------------------------------------+ | ``AIRFLOW_SOURCES`` | ``/opt/airflow`` | Mounted sources of Airflow | diff --git a/ISSUE_TRIAGE_PROCESS.rst b/ISSUE_TRIAGE_PROCESS.rst index 1588db78ad26d..f025fbb3330cd 100644 --- a/ISSUE_TRIAGE_PROCESS.rst +++ b/ISSUE_TRIAGE_PROCESS.rst @@ -44,16 +44,16 @@ to indicate the following elements: **Kind** -The “kind” labels indicate “what kind of issue it is”. The most -commonly used “kind” labels are: bug, feature, documentation, or task. +The "kind" labels indicate "what kind of issue it is". The most +commonly used "kind" labels are: bug, feature, documentation, or task. 
Therefore, when reporting an issue, the label of ``kind:bug`` is to indicate a problem with the functionality, whereas the label of ``kind:feature`` is a desire to extend the functionality. There has been discussion within the project about whether to separate -the desire for “new features” from “enhancements to existing features”, -but in practice most “feature requests” are actually enhancement requests, +the desire for "new features" from "enhancements to existing features", +but in practice most "feature requests" are actually enhancement requests, so we decided to combine them both into ``kind:feature``. The ``kind:task`` is used to categorize issues which are @@ -67,7 +67,7 @@ made to the documentation within the project. **Area** -The “area” set of labels should indicate the component of the code +The "area" set of labels should indicate the component of the code referenced by the issue. At a high level, the biggest areas of the project are: Airflow Core and Airflow Providers, which are referenced by ``area:core`` and ``area:providers``. This is especially important since these are now @@ -75,7 +75,7 @@ being released and versioned independently. There are more detailed areas of the Core Airflow project such as Scheduler, Webserver, API, UI, Logging, and Kubernetes, which are all conceptually under the -“Airflow Core” area of the project. +"Airflow Core" area of the project. Similarly within Airflow Providers, the larger providers such as Apache, AWS, Azure, and Google who have many hooks and operators within them, have labels directly @@ -116,7 +116,7 @@ Therefore, the priority labels used are: It's important to use priority labels effectively so we can triage incoming issues appropriately and make sure that when we release a new version of Airflow, -we can ship a release confident that there are no “production blocker” issues in it. +we can ship a release confident that there are no "production blocker" issues in it. This applies to both Core Airflow as well as the Airflow Providers. With the separation of the Providers release from Core Airflow, a ``priority:critical`` bug in a single @@ -175,13 +175,13 @@ Ideally, these issues only require one or two files to be changed. The intention here is that incremental changes to existing files are a lot easier for a new contributor as compared to adding something completely new. -Another possibility here is to add “how to fix” in the comments of such issues, so +Another possibility here is to add "how to fix" in the comments of such issues, so that new contributors have a running start when then pick up these issues. **Timeliness** -For the sake of quick responses, the general “soft" rule within the Airflow project +For the sake of quick responses, the general "soft" rule within the Airflow project is that if there is no assignee, anyone can take an issue to solve. However, this depends on timely resolution of the issue by the assignee. The @@ -203,6 +203,6 @@ At times issues are marked as invalid and later closed because of one of the following situations: * The issue is a duplicate of an already reported issue. In such cases, the latter issue is marked as ``duplicate``. -* Despite attempts to reproduce the issue to resolve it, the issue cannot be reproduced by the Airflow team based on the given information. In such cases, the issue is marked as ``Can’t Reproduce``. +* Despite attempts to reproduce the issue to resolve it, the issue cannot be reproduced by the Airflow team based on the given information. 
In such cases, the issue is marked as ``Can't Reproduce``. * In some cases, the original creator realizes that the issue was incorrectly reported and then marks it as ``invalid``. Also, a committer could mark it as ``invalid`` if the issue being reported is for an unsupported operation or environment. * In some cases, the issue may be legitimate, but may not be addressed in the short to medium term based on current project priorities or because this will be irrelevant because of an upcoming change. The committer could mark this as ``wontfix`` to set expectations that it won't be directly addressed in the near term. diff --git a/RELEASE_NOTES.rst b/RELEASE_NOTES.rst index 6f0a222704371..9816221f21dee 100644 --- a/RELEASE_NOTES.rst +++ b/RELEASE_NOTES.rst @@ -157,7 +157,7 @@ Continuing the effort to bind TaskInstance to a DagRun, XCom entries are now als Task log templates are now read from the metadata database instead of ``airflow.cfg`` (#20165) """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -Previously, a task’s log is dynamically rendered from the ``[core] log_filename_template`` and ``[elasticsearch] log_id_template`` config values at runtime. This resulted in unfortunate characteristics, e.g. it is impractical to modify the config value after an Airflow instance is running for a while, since all existing task logs have be saved under the previous format and cannot be found with the new config value. +Previously, a task's log is dynamically rendered from the ``[core] log_filename_template`` and ``[elasticsearch] log_id_template`` config values at runtime. This resulted in unfortunate characteristics, e.g. it is impractical to modify the config value after an Airflow instance is running for a while, since all existing task logs have be saved under the previous format and cannot be found with the new config value. A new ``log_template`` table is introduced to solve this problem. This table is synchronized with the aforementioned config values every time Airflow starts, and a new field ``log_template_id`` is added to every DAG run to point to the format used by tasks (``NULL`` indicates the first ever entry for compatibility). @@ -174,9 +174,9 @@ No change in behavior is expected. This was necessary in order to take advantag XCom now defined by ``run_id`` instead of ``execution_date`` (#20975) """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -As a continuation to the TaskInstance-DagRun relation change started in Airflow 2.2, the ``execution_date`` columns on XCom has been removed from the database, and replaced by an `association proxy `_ field at the ORM level. If you access Airflow’s metadata database directly, you should rewrite the implementation to use the ``run_id`` column instead. +As a continuation to the TaskInstance-DagRun relation change started in Airflow 2.2, the ``execution_date`` columns on XCom has been removed from the database, and replaced by an `association proxy `_ field at the ORM level. If you access Airflow's metadata database directly, you should rewrite the implementation to use the ``run_id`` column instead. -Note that Airflow’s metadatabase definition on both the database and ORM levels are considered implementation detail without strict backward compatibility guarantees. +Note that Airflow's metadatabase definition on both the database and ORM levels are considered implementation detail without strict backward compatibility guarantees. Non-JSON-serializable params deprecated (#21135). 
""""""""""""""""""""""""""""""""""""""""""""""""" @@ -224,7 +224,7 @@ This setting is also used for the deprecated experimental API, which only uses t ``airflow.models.base.Operator`` is removed (#21505) """""""""""""""""""""""""""""""""""""""""""""""""""" -Previously, there was an empty class ``airflow.models.base.Operator`` for “type hinting”. This class was never really useful for anything (everything it did could be done better with ``airflow.models.baseoperator.BaseOperator``), and has been removed. If you are relying on the class’s existence, use ``BaseOperator`` (for concrete operators), ``airflow.models.abstractoperator.AbstractOperator`` (the base class of both ``BaseOperator`` and the AIP-42 ``MappedOperator``), or ``airflow.models.operator.Operator`` (a union type ``BaseOperator | MappedOperator`` for type annotation). +Previously, there was an empty class ``airflow.models.base.Operator`` for "type hinting". This class was never really useful for anything (everything it did could be done better with ``airflow.models.baseoperator.BaseOperator``), and has been removed. If you are relying on the class's existence, use ``BaseOperator`` (for concrete operators), ``airflow.models.abstractoperator.AbstractOperator`` (the base class of both ``BaseOperator`` and the AIP-42 ``MappedOperator``), or ``airflow.models.operator.Operator`` (a union type ``BaseOperator | MappedOperator`` for type annotation). Zip files in the DAGs folder can no longer have a ``.py`` extension (#21538) """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" @@ -1235,9 +1235,9 @@ As part of this change the ``clean_tis_without_dagrun_interval`` config option u TaskInstance and TaskReschedule now define ``run_id`` instead of ``execution_date`` """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -As a part of the TaskInstance-DagRun relation change, the ``execution_date`` columns on TaskInstance and TaskReschedule have been removed from the database, and replaced by `association proxy `_ fields at the ORM level. If you access Airflow’s metadatabase directly, you should rewrite the implementation to use the ``run_id`` columns instead. +As a part of the TaskInstance-DagRun relation change, the ``execution_date`` columns on TaskInstance and TaskReschedule have been removed from the database, and replaced by `association proxy `_ fields at the ORM level. If you access Airflow's metadatabase directly, you should rewrite the implementation to use the ``run_id`` columns instead. -Note that Airflow’s metadatabase definition on both the database and ORM levels are considered implementation detail without strict backward compatibility guarantees. +Note that Airflow's metadatabase definition on both the database and ORM levels are considered implementation detail without strict backward compatibility guarantees. DaskExecutor - Dask Worker Resources and queues """"""""""""""""""""""""""""""""""""""""""""""" @@ -1247,9 +1247,9 @@ If dask workers are not started with complementary resources to match the specif Logical date of a DAG run triggered from the web UI now have its sub-second component set to zero """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -Due to a change in how the logical date (``execution_date``) is generated for a manual DAG run, a manual DAG run’s logical date may not match its time-of-trigger, but have its sub-second part zero-ed out. 
For example, a DAG run triggered on ``2021-10-11T12:34:56.78901`` would have its logical date set to ``2021-10-11T12:34:56.00000``. +Due to a change in how the logical date (``execution_date``) is generated for a manual DAG run, a manual DAG run's logical date may not match its time-of-trigger, but have its sub-second part zero-ed out. For example, a DAG run triggered on ``2021-10-11T12:34:56.78901`` would have its logical date set to ``2021-10-11T12:34:56.00000``. -This may affect some logic that expects on this quirk to detect whether a run is triggered manually or not. Note that ``dag_run.run_type`` is a more authoritative value for this purpose. Also, if you need this distinction between automated and manually-triggered run for “next execution date” calculation, please also consider using the new data interval variables instead, which provide a more consistent behavior between the two run types. +This may affect some logic that expects on this quirk to detect whether a run is triggered manually or not. Note that ``dag_run.run_type`` is a more authoritative value for this purpose. Also, if you need this distinction between automated and manually-triggered run for "next execution date" calculation, please also consider using the new data interval variables instead, which provide a more consistent behavior between the two run types. New Features ^^^^^^^^^^^^ @@ -8755,14 +8755,14 @@ A logger is the entry point into the logging system. Each logger is a named buck Each message that is written to the logger is a Log Record. Each log record contains a log level indicating the severity of that specific message. A log record can also contain useful metadata that describes the event that is being logged. This can include details such as a stack trace or an error code. -When a message is given to the logger, the log level of the message is compared to the log level of the logger. If the log level of the message meets or exceeds the log level of the logger itself, the message will undergo further processing. If it doesn’t, the message will be ignored. +When a message is given to the logger, the log level of the message is compared to the log level of the logger. If the log level of the message meets or exceeds the log level of the logger itself, the message will undergo further processing. If it doesn't, the message will be ignored. Once a logger has determined that a message needs to be processed, it is passed to a Handler. This configuration is now more flexible and can be easily be maintained in a single file. Changes in Airflow Logging ~~~~~~~~~~~~~~~~~~~~~~~~~~ -Airflow's logging mechanism has been refactored to use Python’s built-in ``logging`` module to perform logging of the application. By extending classes with the existing ``LoggingMixin``\ , all the logging will go through a central logger. Also the ``BaseHook`` and ``BaseOperator`` already extend this class, so it is easily available to do logging. +Airflow's logging mechanism has been refactored to use Python's built-in ``logging`` module to perform logging of the application. By extending classes with the existing ``LoggingMixin``\ , all the logging will go through a central logger. Also the ``BaseHook`` and ``BaseOperator`` already extend this class, so it is easily available to do logging. The main benefit is easier configuration of the logging by setting a single centralized python file. Disclaimer; there is still some inline configuration, but this will be removed eventually. 
The new logging class is defined by setting the dotted classpath in your ``~/airflow/airflow.cfg`` file: diff --git a/STATIC_CODE_CHECKS.rst b/STATIC_CODE_CHECKS.rst index 45d17dde1889d..966dd134f6b55 100644 --- a/STATIC_CODE_CHECKS.rst +++ b/STATIC_CODE_CHECKS.rst @@ -227,8 +227,6 @@ require Breeze Docker image to be build locally. +--------------------------------------------------------+------------------------------------------------------------------+---------+ | flynt | Run flynt string format converter for Python | | +--------------------------------------------------------+------------------------------------------------------------------+---------+ -| forbid-tabs | Fail if tabs are used in the project | | -+--------------------------------------------------------+------------------------------------------------------------------+---------+ | identity | Print input to the static check hooks for troubleshooting | | +--------------------------------------------------------+------------------------------------------------------------------+---------+ | insert-license | * Add license for all SQL files | | @@ -277,6 +275,8 @@ require Breeze Docker image to be build locally. +--------------------------------------------------------+------------------------------------------------------------------+---------+ | pyupgrade | Upgrade Python code automatically | | +--------------------------------------------------------+------------------------------------------------------------------+---------+ +| replace-bad-characters | Replace bad characters | | ++--------------------------------------------------------+------------------------------------------------------------------+---------+ | rst-backticks | Check if RST files use double backticks for code | | +--------------------------------------------------------+------------------------------------------------------------------+---------+ | run-flake8 | Run flake8 | * | diff --git a/airflow/cli/cli_parser.py b/airflow/cli/cli_parser.py index 0742417ae922a..11b38ebd31163 100644 --- a/airflow/cli/cli_parser.py +++ b/airflow/cli/cli_parser.py @@ -671,13 +671,13 @@ def string_lower_type(val): ARG_WITHOUT_MINGLE = Arg( ("--without-mingle",), default=False, - help="Don’t synchronize with other workers at start-up", + help="Don't synchronize with other workers at start-up", action="store_true", ) ARG_WITHOUT_GOSSIP = Arg( ("--without-gossip",), default=False, - help="Don’t subscribe to other workers events", + help="Don't subscribe to other workers events", action="store_true", ) diff --git a/airflow/providers/apache/kylin/operators/kylin_cube.py b/airflow/providers/apache/kylin/operators/kylin_cube.py index 5fe91ee831934..18bd0966e0c13 100644 --- a/airflow/providers/apache/kylin/operators/kylin_cube.py +++ b/airflow/providers/apache/kylin/operators/kylin_cube.py @@ -46,14 +46,14 @@ class KylinCubeOperator(BaseOperator): :param command: (kylin command include 'build', 'merge', 'refresh', 'delete', 'build_streaming', 'merge_streaming', 'refresh_streaming', 'disable', 'enable', 'purge', 'clone', 'drop'. 
- build - use /kylin/api/cubes/{cubeName}/build rest api,and buildType is ‘BUILD’, + build - use /kylin/api/cubes/{cubeName}/build rest api,and buildType is 'BUILD', and you should give start_time and end_time - refresh - use build rest api,and buildType is ‘REFRESH’ - merge - use build rest api,and buildType is ‘MERGE’ - build_streaming - use /kylin/api/cubes/{cubeName}/build2 rest api,and buildType is ‘BUILD’ + refresh - use build rest api,and buildType is 'REFRESH' + merge - use build rest api,and buildType is 'MERGE' + build_streaming - use /kylin/api/cubes/{cubeName}/build2 rest api,and buildType is 'BUILD' and you should give offset_start and offset_end - refresh_streaming - use build2 rest api,and buildType is ‘REFRESH’ - merge_streaming - use build2 rest api,and buildType is ‘MERGE’ + refresh_streaming - use build2 rest api,and buildType is 'REFRESH' + merge_streaming - use build2 rest api,and buildType is 'MERGE' delete - delete segment, and you should give segment_name value disable - disable cube enable - enable cube diff --git a/airflow/providers/databricks/operators/databricks.py b/airflow/providers/databricks/operators/databricks.py index 028c428ac16bf..6c70601198902 100644 --- a/airflow/providers/databricks/operators/databricks.py +++ b/airflow/providers/databricks/operators/databricks.py @@ -508,7 +508,7 @@ class DatabricksRunNowOperator(BaseOperator): The map is passed to the notebook and will be accessible through the dbutils.widgets.get function. See Widgets for more information. If not specified upon run-now, the triggered run will use the - job’s base parameters. notebook_params cannot be + job's base parameters. notebook_params cannot be specified in conjunction with jar_params. The json representation of this field (i.e. {"notebook_params":{"name":"john doe","age":"35"}}) cannot exceed 10,000 bytes. diff --git a/airflow/providers/datadog/sensors/datadog.py b/airflow/providers/datadog/sensors/datadog.py index 7dbcec80676d6..7a79d8c5db412 100644 --- a/airflow/providers/datadog/sensors/datadog.py +++ b/airflow/providers/datadog/sensors/datadog.py @@ -42,10 +42,10 @@ class DatadogSensor(BaseSensorOperator): :param sources: A comma separated list indicating what tags, if any, should be used to filter the list of monitors by scope :param tags: Get datadog events from specific sources. - :param response_check: A check against the ‘requests’ response object. The callable takes + :param response_check: A check against the 'requests' response object. The callable takes the response object as the first positional argument and optionally any number of keyword arguments available in the context dictionary. It should return True for - ‘pass’ and False otherwise. + 'pass' and False otherwise. :param response_check: Optional[Callable[[Dict[str, Any]], bool]] """ diff --git a/airflow/providers/google/cloud/hooks/automl.py b/airflow/providers/google/cloud/hooks/automl.py index aace9119934b9..740f782a15968 100644 --- a/airflow/providers/google/cloud/hooks/automl.py +++ b/airflow/providers/google/cloud/hooks/automl.py @@ -475,7 +475,7 @@ def deploy_model( """ Deploys a model. If a model is already deployed, deploying it with the same parameters has no effect. Deploying with different parameters (as e.g. changing node_number) will - reset the deployment state without pausing the model_id’s availability. + reset the deployment state without pausing the model_id's availability. 
Only applicable for Text Classification, Image Object Detection and Tables; all other domains manage deployment automatically. diff --git a/airflow/providers/google/cloud/hooks/pubsub.py b/airflow/providers/google/cloud/hooks/pubsub.py index 82fe34c33186c..8a9ac5ff55290 100644 --- a/airflow/providers/google/cloud/hooks/pubsub.py +++ b/airflow/providers/google/cloud/hooks/pubsub.py @@ -333,7 +333,7 @@ def create_subscription( in which they are received by the Pub/Sub system. Otherwise, they may be delivered in any order. :param expiration_policy: A policy that specifies the conditions for this - subscription’s expiration. A subscription is considered active as long as any + subscription's expiration. A subscription is considered active as long as any connected subscriber is successfully consuming messages from the subscription or is issuing operations on the subscription. If expiration_policy is not set, a default policy with ttl of 31 days will be used. The minimum allowed value for diff --git a/airflow/providers/google/cloud/operators/automl.py b/airflow/providers/google/cloud/operators/automl.py index d42b4f1a248dc..1877c3f086637 100644 --- a/airflow/providers/google/cloud/operators/automl.py +++ b/airflow/providers/google/cloud/operators/automl.py @@ -808,7 +808,7 @@ class AutoMLDeployModelOperator(BaseOperator): """ Deploys a model. If a model is already deployed, deploying it with the same parameters has no effect. Deploying with different parameters (as e.g. changing node_number) will - reset the deployment state without pausing the model_id’s availability. + reset the deployment state without pausing the model_id's availability. Only applicable for Text Classification, Image Object Detection and Tables; all other domains manage deployment automatically. diff --git a/airflow/providers/google/cloud/operators/pubsub.py b/airflow/providers/google/cloud/operators/pubsub.py index 7b74427b68c1d..d8974398516ac 100644 --- a/airflow/providers/google/cloud/operators/pubsub.py +++ b/airflow/providers/google/cloud/operators/pubsub.py @@ -265,7 +265,7 @@ class PubSubCreateSubscriptionOperator(BaseOperator): in which they are received by the Pub/Sub system. Otherwise, they may be delivered in any order. :param expiration_policy: A policy that specifies the conditions for this - subscription’s expiration. A subscription is considered active as long as any + subscription's expiration. A subscription is considered active as long as any connected subscriber is successfully consuming messages from the subscription or is issuing operations on the subscription. If expiration_policy is not set, a default policy with ttl of 31 days will be used. The minimum allowed value for diff --git a/airflow/providers/google/cloud/operators/vision.py b/airflow/providers/google/cloud/operators/vision.py index 7df9946c1ffe5..c1043097dcb17 100644 --- a/airflow/providers/google/cloud/operators/vision.py +++ b/airflow/providers/google/cloud/operators/vision.py @@ -241,7 +241,7 @@ class CloudVisionUpdateProductSetOperator(BaseOperator): :param project_id: (Optional) The project in which the ProductSet should be created. If set to None or missing, the default project_id from the Google Cloud connection is used. :param update_mask: (Optional) The `FieldMask` that specifies which fields to update. If update_mask - isn’t specified, all mutable fields are to be updated. Valid mask path is display_name. If a dict is + isn't specified, all mutable fields are to be updated. Valid mask path is display_name. 
If a dict is provided, it must be of the same form as the protobuf message `FieldMask`. :param retry: (Optional) A retry object used to retry requests. If `None` is specified, requests will not be retried. @@ -612,7 +612,7 @@ class CloudVisionUpdateProductOperator(BaseOperator): :param project_id: (Optional) The project in which the Product is located. If set to None or missing, the default project_id from the Google Cloud connection is used. :param update_mask: (Optional) The `FieldMask` that specifies which fields to update. If update_mask - isn’t specified, all mutable fields are to be updated. Valid mask paths include product_labels, + isn't specified, all mutable fields are to be updated. Valid mask paths include product_labels, display_name, and description. If a dict is provided, it must be of the same form as the protobuf message `FieldMask`. :param retry: (Optional) A retry object used to retry requests. If `None` is @@ -1034,7 +1034,7 @@ class CloudVisionAddProductToProductSetOperator(BaseOperator): Possible errors: - - Returns `NOT_FOUND` if the Product or the ProductSet doesn’t exist. + - Returns `NOT_FOUND` if the Product or the ProductSet doesn't exist. .. seealso:: For more information on how to use this operator, take a look at the guide: diff --git a/airflow/providers/google/cloud/utils/field_validator.py b/airflow/providers/google/cloud/utils/field_validator.py index 974c3b4559bbc..f3fdee8fdf553 100644 --- a/airflow/providers/google/cloud/utils/field_validator.py +++ b/airflow/providers/google/cloud/utils/field_validator.py @@ -102,7 +102,7 @@ Forward-compatibility notes --------------------------- Certain decisions are crucial to allow the client APIs to work also with future API -versions. Since body attached is passed to the API’s call, this is entirely +versions. Since body attached is passed to the API's call, this is entirely possible to pass-through any new fields in the body (for future API versions) - albeit without validation on the client side - they can and will still be validated on the server side usually. @@ -120,7 +120,7 @@ remains successful). This is very nice feature to protect against typos in names. * For unions, newly added union variants can be added by future calls and they will pass validation, however the content or presence of those fields will not be validated. - This means that it’s possible to send a new non-validated union field together with an + This means that it's possible to send a new non-validated union field together with an old validated field and this problem will not be detected by the client. In such case warning will be printed. * When you add validator to an operator, you should also add ``validate_body`` parameter diff --git a/airflow/providers/pagerduty/hooks/pagerduty.py b/airflow/providers/pagerduty/hooks/pagerduty.py index 98be80965aaab..2637f96849c91 100644 --- a/airflow/providers/pagerduty/hooks/pagerduty.py +++ b/airflow/providers/pagerduty/hooks/pagerduty.py @@ -121,7 +121,7 @@ def create_event( :param custom_details: Free-form details from the event. Can be a dictionary or a string. If a dictionary is passed it will show up in PagerDuty as a table. :param group: A cluster or grouping of sources. For example, sources - “prod-datapipe-02” and “prod-datapipe-03” might both be part of “prod-datapipe” + "prod-datapipe-02" and "prod-datapipe-03" might both be part of "prod-datapipe" :param component: The part or component of the affected system that is broken. :param class_type: The class/type of the event. 
:param images: List of images to include. Each dictionary in the list accepts the following keys: diff --git a/airflow/providers/pagerduty/hooks/pagerduty_events.py b/airflow/providers/pagerduty/hooks/pagerduty_events.py index c5eaffe105284..12eb1d116a2fd 100644 --- a/airflow/providers/pagerduty/hooks/pagerduty_events.py +++ b/airflow/providers/pagerduty/hooks/pagerduty_events.py @@ -97,7 +97,7 @@ def create_event( :param custom_details: Free-form details from the event. Can be a dictionary or a string. If a dictionary is passed it will show up in PagerDuty as a table. :param group: A cluster or grouping of sources. For example, sources - “prod-datapipe-02” and “prod-datapipe-03” might both be part of “prod-datapipe” + "prod-datapipe-02" and "prod-datapipe-03" might both be part of "prod-datapipe" :param component: The part or component of the affected system that is broken. :param class_type: The class/type of the event. :param images: List of images to include. Each dictionary in the list accepts the following keys: diff --git a/airflow/providers/qubole/operators/qubole.py b/airflow/providers/qubole/operators/qubole.py index 15a39c61bfaa1..838b5dff69722 100644 --- a/airflow/providers/qubole/operators/qubole.py +++ b/airflow/providers/qubole/operators/qubole.py @@ -176,8 +176,8 @@ class QuboleOperator(BaseOperator): jupytercmd: :path: Path including name of the Jupyter notebook to be run with extension. :arguments: Valid JSON to be sent to the notebook. Specify the parameters in notebooks and pass - the parameter value using the JSON format. key is the parameter’s name and value is - the parameter’s value. Supported types in parameters are string, integer, float and boolean. + the parameter value using the JSON format. key is the parameter's name and value is + the parameter's value. Supported types in parameters are string, integer, float and boolean. .. note: diff --git a/airflow/providers/tableau/hooks/tableau.py b/airflow/providers/tableau/hooks/tableau.py index e0d890b605bfd..e890f49c21c6a 100644 --- a/airflow/providers/tableau/hooks/tableau.py +++ b/airflow/providers/tableau/hooks/tableau.py @@ -153,7 +153,7 @@ def get_job_status(self, job_id: str) -> TableauJobFinishCode: .. see also:: https://tableau.github.io/server-client-python/docs/api-ref#jobs :param job_id: The id of the job to check. - :return: An Enum that describe the Tableau job’s return code + :return: An Enum that describe the Tableau job's return code :rtype: TableauJobFinishCode """ return TableauJobFinishCode(int(self.server.jobs.get_by_id(job_id).finish_code)) @@ -164,7 +164,7 @@ def wait_for_state(self, job_id: str, target_state: TableauJobFinishCode, check_ to target_state or different from PENDING. :param job_id: The id of the job to check. - :param target_state: Enum that describe the Tableau job’s target state + :param target_state: Enum that describe the Tableau job's target state :param check_interval: time in seconds that the job should wait in between each instance state checks until operation is completed :return: return True if the job is equal to the target_status, False otherwise. diff --git a/airflow/settings.py b/airflow/settings.py index e8bf80a2d929b..bea68ec8cfc10 100644 --- a/airflow/settings.py +++ b/airflow/settings.py @@ -340,7 +340,7 @@ def prepare_engine_args(disable_connection_pool=False): # When those additional connections are returned to the pool, they are disconnected and discarded. 
# It follows then that the total number of simultaneous connections # the pool will allow is pool_size + max_overflow, - # and the total number of “sleeping” connections the pool will allow is pool_size. + # and the total number of "sleeping" connections the pool will allow is pool_size. # max_overflow can be set to -1 to indicate no overflow limit; # no limit will be placed on the total number # of concurrent connections. Defaults to 10. @@ -353,7 +353,7 @@ def prepare_engine_args(disable_connection_pool=False): pool_recycle = conf.getint('database', 'SQL_ALCHEMY_POOL_RECYCLE', fallback=1800) # Check connection at the start of each connection pool checkout. - # Typically, this is a simple statement like “SELECT 1”, but may also make use + # Typically, this is a simple statement like "SELECT 1", but may also make use # of some DBAPI-specific method to test the connection for liveness. # More information here: # https://docs.sqlalchemy.org/en/13/core/pooling.html#disconnect-handling-pessimistic diff --git a/airflow/www/forms.py b/airflow/www/forms.py index 3a07cbed9310d..d7cb9a9d6b364 100644 --- a/airflow/www/forms.py +++ b/airflow/www/forms.py @@ -139,7 +139,7 @@ class DagRunEditForm(DynamicForm): conf = TextAreaField(lazy_gettext('Conf'), widget=BS3TextAreaROWidget()) def populate_obj(self, item): - """Populates the attributes of the passed obj with data from the form’s fields.""" + """Populates the attributes of the passed obj with data from the form's fields.""" super().populate_obj(item) item.run_type = DagRunType.from_run_id(item.run_id) if item.conf: diff --git a/chart/templates/cleanup/cleanup-cronjob.yaml b/chart/templates/cleanup/cleanup-cronjob.yaml index 365651ecb6435..022e7dda596b6 100644 --- a/chart/templates/cleanup/cleanup-cronjob.yaml +++ b/chart/templates/cleanup/cleanup-cronjob.yaml @@ -43,7 +43,7 @@ metadata: {{- end }} spec: schedule: "{{ .Values.cleanup.schedule }}" - # The cron job does not allow concurrent runs; if it is time for a new job run and the previous job run hasn’t finished yet, the cron job skips the new job run + # The cron job does not allow concurrent runs; if it is time for a new job run and the previous job run hasn't finished yet, the cron job skips the new job run concurrencyPolicy: Forbid jobTemplate: spec: diff --git a/dev/README_RELEASE_AIRFLOW.md b/dev/README_RELEASE_AIRFLOW.md index 7e2cc8da83e9d..bef703e101659 100644 --- a/dev/README_RELEASE_AIRFLOW.md +++ b/dev/README_RELEASE_AIRFLOW.md @@ -828,7 +828,7 @@ Hello, Apache Airflow 2.0.2 (based on RC3) has been accepted. -4 “+1” binding votes received: +4 "+1" binding votes received: - Kaxil Naik - Bolke de Bruin - Ash Berlin-Taylor @@ -1103,12 +1103,12 @@ Post this in the #announce channel: ```shell cat <`__ is an open source, unified model for defining both batch and streaming data-parallel processing pipelines. Using one of the open source Beam SDKs, you build a program -that defines the pipeline. The pipeline is then executed by one of Beam’s supported distributed processing +that defines the pipeline. The pipeline is then executed by one of Beam's supported distributed processing back-ends, which include Apache Flink, Apache Spark, and Google Cloud Dataflow. 
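Aside on the airflow/settings.py hunk above: its comments paraphrase SQLAlchemy's pooling semantics (at most pool_size + max_overflow simultaneous connections, pool_size "sleeping" ones, and an optional liveness check on checkout). A minimal standalone SQLAlchemy sketch of those options follows; the connection URL and numbers are placeholders, not Airflow's own engine configuration.

```python
# Standalone illustration of the pooling options described in the settings.py comments.
# The URL and values are placeholders; Airflow derives its engine args from airflow.cfg.
from sqlalchemy import create_engine, text

engine = create_engine(
    "postgresql+psycopg2://user:pass@localhost/airflow",  # placeholder URL
    pool_size=5,         # up to 5 "sleeping" connections kept in the pool
    max_overflow=10,     # pool_size + max_overflow = 15 simultaneous connections at most
    pool_recycle=1800,   # recycle connections older than 30 minutes
    pool_pre_ping=True,  # lightweight "SELECT 1"-style check on each checkout
)

with engine.connect() as conn:
    conn.execute(text("SELECT 1"))
```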
diff --git a/docs/apache-airflow-providers-ftp/connections/ftp.rst b/docs/apache-airflow-providers-ftp/connections/ftp.rst index 81f4681c96966..a3280df8ed911 100644 --- a/docs/apache-airflow-providers-ftp/connections/ftp.rst +++ b/docs/apache-airflow-providers-ftp/connections/ftp.rst @@ -52,7 +52,7 @@ Extra (optional) Specify the extra parameters (as json dictionary) that can be used in ftp connection. The following parameters are all optional: - * ``passive``: Enable “passive” mode if val is true, otherwise disable passive mode. + * ``passive``: Enable "passive" mode if val is true, otherwise disable passive mode. Passive mode is on by default. When specifying the connection in environment variable you should specify diff --git a/docs/apache-airflow-providers-google/operators/cloud/datafusion.rst b/docs/apache-airflow-providers-google/operators/cloud/datafusion.rst index ab44404b1052f..c9d6f1d86b43c 100644 --- a/docs/apache-airflow-providers-google/operators/cloud/datafusion.rst +++ b/docs/apache-airflow-providers-google/operators/cloud/datafusion.rst @@ -23,7 +23,7 @@ Google DataFusion Operators Cloud Data Fusion is a fully managed, cloud-native data integration service that helps users efficiently build and manage ETL/ELT data pipelines. With a graphical interface and a broad open source library of preconfigured connectors and transformations, Cloud -Data Fusion shifts an organization’s focus away from code and integration to insights +Data Fusion shifts an organization's focus away from code and integration to insights and action. Prerequisite Tasks diff --git a/docs/apache-airflow-providers-google/operators/cloud/mlengine.rst b/docs/apache-airflow-providers-google/operators/cloud/mlengine.rst index 93da6387d497d..fceeeee83c6a5 100644 --- a/docs/apache-airflow-providers-google/operators/cloud/mlengine.rst +++ b/docs/apache-airflow-providers-google/operators/cloud/mlengine.rst @@ -93,7 +93,7 @@ Creating model versions A model version is a subset of the model container where the code runs. A new version of the model can be created through the :class:`~airflow.providers.google.cloud.operators.mlengine.MLEngineCreateVersionOperator`. The model must be specified by ``model_name``, and the ``version`` parameter should contain a dictionary of -all the information about the version. Within the ``version`` parameter’s dictionary, the ``name`` field is +all the information about the version. Within the ``version`` parameter's dictionary, the ``name`` field is required. .. exampleinclude:: /../../airflow/providers/google/cloud/example_dags/example_mlengine.py @@ -199,7 +199,7 @@ prediction result, then returns a tuple of metrics. :start-after: [START howto_operator_gcp_mlengine_get_metric] :end-before: [END howto_operator_gcp_mlengine_get_metric] -To evaluate a prediction and model, it’s useful to have a function to validate the summary result. +To evaluate a prediction and model, it's useful to have a function to validate the summary result. This function receives a dictionary of the averaged metrics the function above generated. It then raises an exception if a task fails or should not proceed. 
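The mlengine.rst paragraph above describes a user-supplied function that receives the dictionary of averaged metrics and raises an exception when the model should not proceed. A hypothetical validator of that shape is sketched below; the metric keys and the 0.2 threshold are made up for illustration and are not part of the patch.

```python
def validate_summary(summary: dict) -> dict:
    """Hypothetical check on averaged prediction metrics; keys and threshold are illustrative."""
    if summary.get("err", 1.0) > 0.2:
        raise ValueError(f"Average error too high, not proceeding: {summary}")
    if summary.get("count", 0) == 0:
        raise ValueError("No predictions were summarized; aborting.")
    return summary
```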
diff --git a/docs/apache-airflow-providers-google/operators/cloud/pubsub.rst b/docs/apache-airflow-providers-google/operators/cloud/pubsub.rst index 36f9c8e27621c..6e7f073a087b1 100644 --- a/docs/apache-airflow-providers-google/operators/cloud/pubsub.rst +++ b/docs/apache-airflow-providers-google/operators/cloud/pubsub.rst @@ -22,7 +22,7 @@ Google Cloud PubSub Operators `Google Cloud PubSub `__ is a fully-managed real-time messaging service that allows you to send and receive messages between independent applications. -You can leverage Cloud Pub/Sub’s flexibility to decouple systems and components hosted +You can leverage Cloud Pub/Sub's flexibility to decouple systems and components hosted on Google Cloud or elsewhere on the Internet. Publisher applications can send messages to a topic and other applications can subscribe to that topic to receive the messages. diff --git a/docs/apache-airflow-providers-sftp/connections/sftp.rst b/docs/apache-airflow-providers-sftp/connections/sftp.rst index 95c88ebb645a3..6cc3cb47fbb24 100644 --- a/docs/apache-airflow-providers-sftp/connections/sftp.rst +++ b/docs/apache-airflow-providers-sftp/connections/sftp.rst @@ -72,7 +72,7 @@ Extra (optional) * ``disabled_algorithms`` - A dictionary mapping algorithm type to an iterable of algorithm identifiers, which will be disabled for the lifetime of the transport. * ``ciphers`` - A list of ciphers to use in order of preference. -Example “extras” field using ``host_key``: +Example "extras" field using ``host_key``: .. code-block:: json @@ -82,7 +82,7 @@ Example “extras” field using ``host_key``: "host_key": "AAAHD...YDWwq==" } -Example “extras” field using ``key_file`` or ``private_key``: +Example "extras" field using ``key_file`` or ``private_key``: .. code-block:: json diff --git a/docs/apache-airflow-providers-tableau/connections/tableau.rst b/docs/apache-airflow-providers-tableau/connections/tableau.rst index 05d576670c3b2..2035ff92730e5 100644 --- a/docs/apache-airflow-providers-tableau/connections/tableau.rst +++ b/docs/apache-airflow-providers-tableau/connections/tableau.rst @@ -69,7 +69,7 @@ Extra (optional) The following parameters are all optional: * ``site_id``: This corresponds to the contentUrl attribute in the Tableau REST API. The ``site_id`` is the portion of - the URL that follows the /site/ in the URL. For example, “MarketingTeam” is the ``site_id`` in the following URL + the URL that follows the /site/ in the URL. For example, "MarketingTeam" is the ``site_id`` in the following URL MyServer/#/site/MarketingTeam/projects. To specify the default site on Tableau Server, you can use an empty string '' (single quotes, no space). For Tableau Online, you must provide a value for the ``site_id.`` This is used for both token and password Authentication. @@ -77,8 +77,8 @@ Extra (optional) This is used with token authentication. * ``personal_access_token``: The personal access token value. This is used with token authentication. - * ``verify``: Either a boolean, in which case it controls whether we verify the server’s TLS certificate, or a string, in which case it must be a path to a CA bundle to use. Defaults to True. - * ``cert``: if String, path to ssl client cert file (.pem). If Tuple, (‘cert’, ‘key’) pair. + * ``verify``: Either a boolean, in which case it controls whether we verify the server's TLS certificate, or a string, in which case it must be a path to a CA bundle to use. Defaults to True. + * ``cert``: if String, path to ssl client cert file (.pem). If Tuple, ('cert', 'key') pair. 
When specifying the connection in environment variable you should specify diff --git a/docs/apache-airflow/concepts/scheduler.rst b/docs/apache-airflow/concepts/scheduler.rst index 0ee6724699466..655633e83d771 100644 --- a/docs/apache-airflow/concepts/scheduler.rst +++ b/docs/apache-airflow/concepts/scheduler.rst @@ -44,7 +44,7 @@ Your DAGs will start executing once the scheduler is running successfully. .. note:: The first DAG Run is created based on the minimum ``start_date`` for the tasks in your DAG. - Subsequent DAG Runs are created by the scheduler process, based on your DAG’s ``schedule_interval``, + Subsequent DAG Runs are created by the scheduler process, based on your DAG's ``schedule_interval``, sequentially. @@ -56,7 +56,7 @@ In the UI, it appears as if Airflow is running your tasks a day **late** If you run a DAG on a ``schedule_interval`` of one day, the run with data interval starting on ``2019-11-21`` triggers after ``2019-11-21T23:59``. - **Let’s Repeat That**, the scheduler runs your job one ``schedule_interval`` AFTER the start date, at the END of the interval. + **Let's Repeat That**, the scheduler runs your job one ``schedule_interval`` AFTER the start date, at the END of the interval. You should refer to :doc:`/dag-run` for details on scheduling a DAG. @@ -382,6 +382,6 @@ However you can also look at other non-performance-related scheduler configurati renamed in the future with deprecation of the current name. - :ref:`config:scheduler__schedule_after_task_execution` - Should the Task supervisor process perform a “mini scheduler” to attempt to schedule more tasks of + Should the Task supervisor process perform a "mini scheduler" to attempt to schedule more tasks of the same DAG. Leaving this on will mean tasks in the same DAG execute quicker, but might starve out other DAGs in some circumstances. diff --git a/docs/apache-airflow/concepts/smart-sensors.rst b/docs/apache-airflow/concepts/smart-sensors.rst index 0113ce401aec5..f8fcbc95d7b2d 100644 --- a/docs/apache-airflow/concepts/smart-sensors.rst +++ b/docs/apache-airflow/concepts/smart-sensors.rst @@ -29,7 +29,7 @@ Smart Sensors efficiency gains. If you are considering writing a new Smart Sensor, you should instead write it as a Deferrable Operator. -The smart sensor is a service (run by a builtin DAG) which greatly reduces Airflow’s infrastructure +The smart sensor is a service (run by a builtin DAG) which greatly reduces Airflow's infrastructure cost by consolidating multiple instances of small, light-weight Sensors into a single process. .. image:: /img/smart_sensor_architecture.png @@ -45,15 +45,15 @@ In this way, we only need a handful of running processes. .. image:: /img/smart_sensor_single_task_execute_flow.png -The smart sensor service is supported in a new mode called “smart sensor mode”. In smart sensor mode, +The smart sensor service is supported in a new mode called "smart sensor mode". In smart sensor mode, instead of holding a long running process for each sensor and poking periodically, a sensor will only -store poke context at sensor_instance table and then exits with a ‘sensing’ state. +store poke context at sensor_instance table and then exits with a 'sensing' state. When the smart sensor mode is enabled, a special set of builtin smart sensor DAGs (named smart_sensor_group_shard_xxx) is created by the system; These DAGs contain ``SmartSensorOperator`` task and manage the smart sensor jobs for the Airflow cluster. 
The SmartSensorOperator task can fetch -hundreds of ‘sensing’ instances from sensor_instance table and poke on behalf of them in batches. -Users don’t need to change their existing DAGs. +hundreds of 'sensing' instances from sensor_instance table and poke on behalf of them in batches. +Users don't need to change their existing DAGs. Enable/Disable Smart Sensor --------------------------- @@ -77,13 +77,13 @@ Add the following settings in the ``airflow.cfg``: * ``shards``: This config indicates the number of concurrently running smart sensor jobs for the Airflow cluster. * ``sensors_enabled``: This config is a list of sensor class names that will use the smart sensor. - The users use the same class names (e.g. HivePartitionSensor) in their DAGs and they don’t have + The users use the same class names (e.g. HivePartitionSensor) in their DAGs and they don't have the control to use smart sensors or not, unless they exclude their tasks explicitly. Enabling/disabling the smart sensor service is a system level configuration change. It is transparent to the individual users. Existing DAGs don't need to be changed for enabling/disabling the smart sensor. Rotating centralized smart sensor tasks will not -cause any user’s sensor task failure. +cause any user's sensor task failure. * Using callback arguments (``on_success_callback``, ``on_failure_callback``, and ``on_retry_callback``) on a sensor task is not compatible with the smart sensor mode. If any callback arguments are provided, the sensor task will not be executed when the smart sensor mode is enabled. diff --git a/docs/apache-airflow/dag-run.rst b/docs/apache-airflow/dag-run.rst index a91017bd05290..f3357e9785917 100644 --- a/docs/apache-airflow/dag-run.rst +++ b/docs/apache-airflow/dag-run.rst @@ -159,7 +159,7 @@ just after midnight on the morning of 2016-01-03 with a data interval between If the ``dag.catchup`` value had been ``True`` instead, the scheduler would have created a DAG Run for each completed interval between 2015-12-01 and 2016-01-02 (but not yet one for 2016-01-02, -as that interval hasn’t completed) and the scheduler will execute them sequentially. +as that interval hasn't completed) and the scheduler will execute them sequentially. Catchup is also triggered when you turn off a DAG for a specified period and then re-enable it. @@ -229,7 +229,7 @@ Note that DAG Runs can also be created manually through the CLI. Just run the co airflow dags trigger --exec-date logical_date run_id -The DAG Runs created externally to the scheduler get associated with the trigger’s timestamp and are displayed +The DAG Runs created externally to the scheduler get associated with the trigger's timestamp and are displayed in the UI alongside scheduled DAG runs. The logical date passed inside the DAG can be specified using the ``-e`` argument. The default is the current date in the UTC timezone. diff --git a/docs/apache-airflow/privacy_notice.rst b/docs/apache-airflow/privacy_notice.rst index 0356b677811f4..e07e509fabc86 100644 --- a/docs/apache-airflow/privacy_notice.rst +++ b/docs/apache-airflow/privacy_notice.rst @@ -53,7 +53,7 @@ GA provides us with information on the following 3 categories: - Audience reports: includes information about active users (interactions with the site in the last 1, 7, 14, or 30 days), where users come from (geographical location) and what language they speak (language of the browser), as well as the insight on the percentages of new and returning visitors. 
- Acquisition reports: includes information about how users are finding the website with detailed understanding of traffic and bounce rate. Website owners are able to see their main traffic categories, such as organic search, referral and direct, as well as the information about how many pages users view and how much time they spend on the website. -- Behaviour reports: includes information on granulated metrics such as the average time a user spends on a webpage, total number of pageviews and the site’s bounce rate, site’s most visited pages, and the most popular pages through which a user enters and exits your site (landing pages and exit pages), as well as insight on how quickly the website loads. +- Behaviour reports: includes information on granulated metrics such as the average time a user spends on a webpage, total number of pageviews and the site's bounce rate, site's most visited pages, and the most popular pages through which a user enters and exits your site (landing pages and exit pages), as well as insight on how quickly the website loads. Information collected has to do with your browser, time spent on each page, pages visited, and location (down to a granularity of city, not further). We do not track or collect personally identifiable information or associate gathered data with any personally identifying information from other sources. diff --git a/docs/apache-airflow/production-deployment.rst b/docs/apache-airflow/production-deployment.rst index 64c8948db32bc..601ebae56c14a 100644 --- a/docs/apache-airflow/production-deployment.rst +++ b/docs/apache-airflow/production-deployment.rst @@ -118,7 +118,7 @@ To mitigate these issues, make sure you have a :doc:`health check ` for use in a containerized environment. Consider using it to guarantee that software will always run the same no matter where it’s deployed. +We provide :doc:`a Docker Image (OCI) for Apache Airflow ` for use in a containerized environment. Consider using it to guarantee that software will always run the same no matter where it's deployed. Helm Chart for Kubernetes ========================= diff --git a/docs/apache-airflow/release-process.rst b/docs/apache-airflow/release-process.rst index 8de37807327cc..e925dcf4d7eb2 100644 --- a/docs/apache-airflow/release-process.rst +++ b/docs/apache-airflow/release-process.rst @@ -25,7 +25,7 @@ Since Airflow 2.0.0 and provider packages 1.0.0 we aim to follow SemVer, meaning - X is the major version number. - Y is the minor version number, also called the *feature release* version number. - Z is the patch number, which is incremented for bugfix and security releases. - Before every new release, we’ll make a release candidate available, and often alpha or beta release too. + Before every new release, we'll make a release candidate available, and often alpha or beta release too. These are of the form X.Y.Z alpha/beta/rc N, which means the Nth alpha/beta/release candidate of version X.Y.Z In git, each minor version will have its own branch, called ``vX-Y-stable`` where bugfix/security releases will be issued from. @@ -61,7 +61,7 @@ That is all SemVer is -- it's a statement of our intent as package authors, and These releases will be 100% compatible with the associated feature release. So the answer to "should I upgrade to the latest patch release?" will always be "yes." - The only exception to the above with respect to 100% backwards compatibility is when a security or data loss issue can’t be fixed without breaking backwards-compatibility. 
+ The only exception to the above with respect to 100% backwards compatibility is when a security or data loss issue can't be fixed without breaking backwards-compatibility. If this happens, the release notes will provide detailed upgrade instructions. **No new features will be added in patch releases** diff --git a/docs/apache-airflow/security/secrets/fernet.rst b/docs/apache-airflow/security/secrets/fernet.rst index 28c06fee70001..341e2186bd377 100644 --- a/docs/apache-airflow/security/secrets/fernet.rst +++ b/docs/apache-airflow/security/secrets/fernet.rst @@ -22,7 +22,7 @@ Fernet Airflow uses `Fernet `__ to encrypt passwords in the connection configuration and the variable configuration. It guarantees that a password encrypted using it cannot be manipulated or read without the key. -Fernet is an implementation of symmetric (also known as “secret key”) authenticated cryptography. +Fernet is an implementation of symmetric (also known as "secret key") authenticated cryptography. The first time Airflow is started, the ``airflow.cfg`` file is generated with the default configuration and the unique Fernet key. The key is saved to option ``fernet_key`` of section ``[core]``. diff --git a/docs/apache-airflow/timezone.rst b/docs/apache-airflow/timezone.rst index d49d3bab5ac9e..9e5e65e3c521c 100644 --- a/docs/apache-airflow/timezone.rst +++ b/docs/apache-airflow/timezone.rst @@ -22,21 +22,21 @@ Time Zones Support for time zones is enabled by default. Airflow stores datetime information in UTC internally and in the database. It allows you to run your DAGs with time zone dependent schedules. At the moment, Airflow does not convert them to the -end user’s time zone in the user interface. It will always be displayed in UTC there. Also, templates used in Operators +end user's time zone in the user interface. It will always be displayed in UTC there. Also, templates used in Operators are not converted. Time zone information is exposed and it is up to the writer of DAG to decide what do with it. This is handy if your users live in more than one time zone and you want to display datetime information according to -each user’s wall clock. +each user's wall clock. Even if you are running Airflow in only one time zone, it is still good practice to store data in UTC in your database (also before Airflow became time zone aware this was also the recommended or even required setup). The main reason is that many countries use Daylight Saving Time (DST), where clocks are moved forward in spring and backward -in autumn. If you’re working in local time, you’re likely to encounter errors twice a year, when the transitions -happen. (The pendulum and pytz documentation discuss these issues in greater detail.) This probably doesn’t matter -for a simple DAG, but it’s a problem if you are in, for example, financial services where you have end of day +in autumn. If you're working in local time, you're likely to encounter errors twice a year, when the transitions +happen. (The pendulum and pytz documentation discuss these issues in greater detail.) This probably doesn't matter +for a simple DAG, but it's a problem if you are in, for example, financial services where you have end of day deadlines to meet. -The time zone is set in ``airflow.cfg``. By default it is set to UTC, but you change it to use the system’s settings or +The time zone is set in ``airflow.cfg``. By default it is set to UTC, but you change it to use the system's settings or an arbitrary IANA time zone, e.g. ``Europe/Amsterdam``. 
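A minimal sketch, assuming a stock Airflow 2.x installation (DAG id, dates and schedule are illustrative only), of how the aware ``start_date`` recommended by this timezone.rst passage is typically created with pendulum:

.. code-block:: python

    import pendulum

    from airflow import DAG
    from airflow.operators.bash import BashOperator

    # An aware datetime in an arbitrary IANA time zone; Airflow stores datetimes in UTC internally.
    local_tz = pendulum.timezone("Europe/Amsterdam")

    with DAG(
        dag_id="tz_aware_example",  # hypothetical name, for illustration only
        start_date=pendulum.datetime(2021, 1, 1, tz=local_tz),
        schedule_interval="@daily",
        catchup=False,
    ) as dag:
        BashOperator(task_id="hello", bash_command="echo hello")

Because the ``start_date`` is aware, the DST pitfalls discussed below (ambiguous or non-existent local times) are handled by pendulum rather than by the DAG author.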
It is dependent on ``pendulum``, which is more accurate than ``pytz``. Pendulum is installed when you install Airflow. @@ -63,9 +63,9 @@ Concepts Naive and aware datetime objects '''''''''''''''''''''''''''''''' -Python’s datetime.datetime objects have a tzinfo attribute that can be used to store time zone information, +Python's datetime.datetime objects have a tzinfo attribute that can be used to store time zone information, represented as an instance of a subclass of datetime.tzinfo. When this attribute is set and describes an offset, -a datetime object is aware. Otherwise, it’s naive. +a datetime object is aware. Otherwise, it's naive. You can use ``timezone.is_localized()`` and ``timezone.is_naive()`` to determine whether datetimes are aware or naive. @@ -99,12 +99,12 @@ words if you have a default time zone setting of ``Europe/Amsterdam`` and create op = BashOperator(task_id="dummy", bash_command="Hello World!", dag=dag) print(op.retries) # 3 -Unfortunately, during DST transitions, some datetimes don’t exist or are ambiguous. -In such situations, pendulum raises an exception. That’s why you should always create aware +Unfortunately, during DST transitions, some datetimes don't exist or are ambiguous. +In such situations, pendulum raises an exception. That's why you should always create aware datetime objects when time zone support is enabled. In practice, this is rarely an issue. Airflow gives you time zone aware datetime objects in the models and DAGs, and most often, -new datetime objects are created from existing ones through timedelta arithmetic. The only datetime that’s often +new datetime objects are created from existing ones through timedelta arithmetic. The only datetime that's often created in application code is the current time, and ``timezone.utcnow()`` automatically does the right thing. diff --git a/docs/apache-airflow/usage-cli.rst b/docs/apache-airflow/usage-cli.rst index c14efacb1d2ff..14f88f03fc948 100644 --- a/docs/apache-airflow/usage-cli.rst +++ b/docs/apache-airflow/usage-cli.rst @@ -52,7 +52,7 @@ For one-time activation of argcomplete for airflow only, use: .. image:: img/cli_completion.gif -If you’re using ``zsh``, add the following to your ``.zshrc``: +If you're using ``zsh``, add the following to your ``.zshrc``: .. code-block:: bash diff --git a/docs/conf.py b/docs/conf.py index b75377ddc747d..ce88dee2566c2 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -338,7 +338,7 @@ def _get_rst_filepath_from_path(filepath: str): {'href': '/ecosystem/', 'text': 'Ecosystem'}, ] -# A dictionary of values to pass into the template engine’s context for all pages. +# A dictionary of values to pass into the template engine's context for all pages. html_context = { # Google Analytics ID. # For more information look at: diff --git a/docs/docker-stack/build-arg-ref.rst b/docs/docker-stack/build-arg-ref.rst index f8e30c02ce76e..fb41e551412a9 100644 --- a/docs/docker-stack/build-arg-ref.rst +++ b/docs/docker-stack/build-arg-ref.rst @@ -40,7 +40,7 @@ Those are the most common arguments that you use when you want to build a custom | ``ADDITIONAL_AIRFLOW_EXTRAS`` | | Optional additional extras with which | | | | airflow is installed. | +------------------------------------------+------------------------------------------+---------------------------------------------+ -| ``AIRFLOW_HOME`` | ``/opt/airflow`` | Airflow’s HOME (that’s where logs and | +| ``AIRFLOW_HOME`` | ``/opt/airflow`` | Airflow's HOME (that's where logs and | | | | SQLite databases are stored). 
| +------------------------------------------+------------------------------------------+---------------------------------------------+ | ``AIRFLOW_USER_HOME_DIR`` | ``/home/airflow`` | Home directory of the Airflow user. | diff --git a/docs/exts/docs_build/dev_index_template.html.jinja2 b/docs/exts/docs_build/dev_index_template.html.jinja2 index e1b9e9bcf6d39..6a52742951e52 100644 --- a/docs/exts/docs_build/dev_index_template.html.jinja2 +++ b/docs/exts/docs_build/dev_index_template.html.jinja2 @@ -75,7 +75,7 @@

Docker image

- It makes efficient, lightweight, self-contained environment and guarantees that software will always run the same no matter of where it’s deployed.
+ It makes an efficient, lightweight, self-contained environment and guarantees that software will always run the same no matter where it's deployed.

diff --git a/docs/helm-chart/quick-start.rst b/docs/helm-chart/quick-start.rst index 0ffcbe9eb8143..1191121a859f0 100644 --- a/docs/helm-chart/quick-start.rst +++ b/docs/helm-chart/quick-start.rst @@ -29,7 +29,7 @@ We recommend testing with Kubernetes 1.20+, example: kind create cluster --image kindest/node:v1.21.1 -Confirm it’s up: +Confirm it's up: .. code-block:: bash diff --git a/images/breeze/output-commands-hash.txt b/images/breeze/output-commands-hash.txt index 5d9904c10d812..1644cf708ab7c 100644 --- a/images/breeze/output-commands-hash.txt +++ b/images/breeze/output-commands-hash.txt @@ -28,7 +28,7 @@ self-upgrade:b5437c0a1a91533a11ee9d0a9692369c setup-autocomplete:355b72dee171c2fcba46fc90ac7c97b0 shell:4680295fdd8a276d51518d29360c365c start-airflow:92cf775a952439a32d409cd2536da507 -static-checks:c7adc5d6dff34624ef413d3d146ec974 +static-checks:bc6b9a121ea38404ac5f28e727146b90 stop:8ebd8a42f1003495d37b884de5ac7ce6 tests:ae8d62b505ff8f79bddc202fe9d575e3 verify-image:a6b3c70957aea96a5d4d261f23359a2d diff --git a/images/breeze/output-static-checks.svg b/images/breeze/output-static-checks.svg index 90272c90b47b7..a1c8f7560eadc 100644 --- a/images/breeze/output-static-checks.svg +++ b/images/breeze/output-static-checks.svg @@ -19,237 +19,237 @@ font-weight: 700; } - .terminal-618500324-matrix { + .terminal-2165150644-matrix { font-family: Fira Code, monospace; font-size: 20px; line-height: 24.4px; font-variant-east-asian: full-width; } - .terminal-618500324-title { + .terminal-2165150644-title { font-size: 18px; font-weight: bold; font-family: arial; } - .terminal-618500324-r1 { fill: #c5c8c6;font-weight: bold } -.terminal-618500324-r2 { fill: #c5c8c6 } -.terminal-618500324-r3 { fill: #d0b344;font-weight: bold } -.terminal-618500324-r4 { fill: #868887 } -.terminal-618500324-r5 { fill: #68a0b3;font-weight: bold } -.terminal-618500324-r6 { fill: #98a84b;font-weight: bold } -.terminal-618500324-r7 { fill: #8d7b39 } + .terminal-2165150644-r1 { fill: #c5c8c6;font-weight: bold } +.terminal-2165150644-r2 { fill: #c5c8c6 } +.terminal-2165150644-r3 { fill: #d0b344;font-weight: bold } +.terminal-2165150644-r4 { fill: #868887 } +.terminal-2165150644-r5 { fill: #68a0b3;font-weight: bold } +.terminal-2165150644-r6 { fill: #98a84b;font-weight: bold } +.terminal-2165150644-r7 { fill: #8d7b39 } - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - Command: static-checks + Command: static-checks - + - - -Usage: breeze static-checks [OPTIONS] [PRECOMMIT_ARGS]... - -Run static checks. - -╭─ Pre-commit flags ───────────────────────────────────────────────────────────────────────────────────────────────────╮ ---type-tType(s) of the static checks to run (multiple can be added).                             
-(all | black | blacken-docs | check-airflow-2-2-compatibility |                          -check-airflow-config-yaml-consistent | check-apache-license-rat |                        -check-base-operator-partial-arguments | check-base-operator-usage |                      -check-boring-cyborg-configuration | check-breeze-top-dependencies-limited |              -check-builtin-literals | check-changelog-has-no-duplicates |                             -check-daysago-import-from-utils | check-docstring-param-types | check-example-dags-urls  -| check-executables-have-shebangs | check-extra-packages-references | check-extras-order -| check-for-inclusive-language | check-hooks-apply | check-incorrect-use-of-LoggingMixin -| check-integrations-are-consistent | check-merge-conflict |                             -check-newsfragments-are-valid | check-no-providers-in-core-examples |                    -check-no-relative-imports | check-persist-credentials-disabled-in-github-workflows |     -check-pre-commit-information-consistent | check-provide-create-sessions-imports |        -check-provider-yaml-valid | check-providers-init-file-missing |                          -check-providers-subpackages-init-file-exist | check-pydevd-left-in-code |                -check-revision-heads-map | check-safe-filter-usage-in-html | check-setup-order |         -check-start-date-not-used-in-defaults | check-system-tests-present |                     -check-system-tests-tocs | check-xml | codespell | create-missing-init-py-files-tests |   -debug-statements | detect-private-key | doctoc | end-of-file-fixer | fix-encoding-pragma -| flynt | forbid-tabs | identity | insert-license | isort | lint-chart-schema | lint-css -| lint-dockerfile | lint-helm-chart | lint-javascript | lint-json-schema | lint-markdown -| lint-openapi | mixed-line-ending | pretty-format-json | pydocstyle |                   -python-no-log-warn | pyupgrade | rst-backticks | run-flake8 | run-mypy | run-shellcheck  -| static-check-autoflake | trailing-whitespace | update-breeze-cmd-output |              -update-breeze-readme-config-hash | update-extras | update-in-the-wild-to-be-sorted |     -update-inlined-dockerfile-scripts | update-local-yml-file | update-migration-references  -| update-providers-dependencies | update-setup-cfg-file |                                -update-spelling-wordlist-to-be-sorted | update-supported-versions |                      -update-vendored-in-k8s-json-schema | update-version | yamllint | yesqa)                  ---file-fList of files to run the checks on.(PATH) ---all-files-aRun checks on all files. ---show-diff-on-failure-sShow diff for files modified by the checks. ---last-commit-cRun checks for all files in last commit. Mutually exclusive with --commit-ref. -╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ ---commit-ref-rRun checks for this commit reference only (can be any git commit-ish reference). Mutually   -exclusive with --last-commit.                                                               -(TEXT)                                                                                      ---verbose-vPrint verbose information about performed steps. ---dry-run-DIf dry-run is set, commands are only printed, not executed. 
---github-repository-gGitHub repository used to pull, push run images.(TEXT)[default: apache/airflow] ---help-hShow this message and exit. -╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ + + +Usage: breeze static-checks [OPTIONS] [PRECOMMIT_ARGS]... + +Run static checks. + +╭─ Pre-commit flags ───────────────────────────────────────────────────────────────────────────────────────────────────╮ +--type-tType(s) of the static checks to run (multiple can be added).                             +(all | black | blacken-docs | check-airflow-2-2-compatibility |                          +check-airflow-config-yaml-consistent | check-apache-license-rat |                        +check-base-operator-partial-arguments | check-base-operator-usage |                      +check-boring-cyborg-configuration | check-breeze-top-dependencies-limited |              +check-builtin-literals | check-changelog-has-no-duplicates |                             +check-daysago-import-from-utils | check-docstring-param-types | check-example-dags-urls  +| check-executables-have-shebangs | check-extra-packages-references | check-extras-order +| check-for-inclusive-language | check-hooks-apply | check-incorrect-use-of-LoggingMixin +| check-integrations-are-consistent | check-merge-conflict |                             +check-newsfragments-are-valid | check-no-providers-in-core-examples |                    +check-no-relative-imports | check-persist-credentials-disabled-in-github-workflows |     +check-pre-commit-information-consistent | check-provide-create-sessions-imports |        +check-provider-yaml-valid | check-providers-init-file-missing |                          +check-providers-subpackages-init-file-exist | check-pydevd-left-in-code |                +check-revision-heads-map | check-safe-filter-usage-in-html | check-setup-order |         +check-start-date-not-used-in-defaults | check-system-tests-present |                     +check-system-tests-tocs | check-xml | codespell | create-missing-init-py-files-tests |   +debug-statements | detect-private-key | doctoc | end-of-file-fixer | fix-encoding-pragma +| flynt | identity | insert-license | isort | lint-chart-schema | lint-css |             +lint-dockerfile | lint-helm-chart | lint-javascript | lint-json-schema | lint-markdown | +lint-openapi | mixed-line-ending | pretty-format-json | pydocstyle | python-no-log-warn  +| pyupgrade | replace-bad-characters | rst-backticks | run-flake8 | run-mypy |           +run-shellcheck | static-check-autoflake | trailing-whitespace | update-breeze-cmd-output +| update-breeze-readme-config-hash | update-extras | update-in-the-wild-to-be-sorted |   +update-inlined-dockerfile-scripts | update-local-yml-file | update-migration-references  +| update-providers-dependencies | update-setup-cfg-file |                                +update-spelling-wordlist-to-be-sorted | update-supported-versions |                      +update-vendored-in-k8s-json-schema | update-version | yamllint | yesqa)                  +--file-fList of files to run the checks on.(PATH) +--all-files-aRun checks on all files. +--show-diff-on-failure-sShow diff for files modified by the checks. +--last-commit-cRun checks for all files in last commit. Mutually exclusive with --commit-ref. 
+╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +--commit-ref-rRun checks for this commit reference only (can be any git commit-ish reference). Mutually   +exclusive with --last-commit.                                                               +(TEXT)                                                                                      +--verbose-vPrint verbose information about performed steps. +--dry-run-DIf dry-run is set, commands are only printed, not executed. +--github-repository-gGitHub repository used to pull, push run images.(TEXT)[default: apache/airflow] +--help-hShow this message and exit. +╰──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ diff --git a/scripts/ci/pre_commit/pre_commit_replace_bad_characters.py b/scripts/ci/pre_commit/pre_commit_replace_bad_characters.py new file mode 100755 index 0000000000000..423255ee0e5eb --- /dev/null +++ b/scripts/ci/pre_commit/pre_commit_replace_bad_characters.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import re +import sys +from pathlib import Path +from typing import List, NamedTuple + +from rich.console import Console + +if __name__ != "__main__": + raise Exception( + "This file is intended to be executed as an executable program. You cannot use it as a module." 
+ f"To run this script, run the {__file__} command" + ) + +console = Console(width=400, color_system="standard") + + +class RegexpSpec(NamedTuple): + regexp: str + replacement: str + description: str + + +REPLACEMENTS: List[RegexpSpec] = [ + RegexpSpec(regexp=r'\t', replacement=' ', description=' with 4 spaces'), + RegexpSpec(regexp=r'\u00A0', replacement=' ', description='  with space'), + RegexpSpec(regexp=r'\u2018', replacement="'", description='left single quotation with straight one'), + RegexpSpec(regexp=r'\u2019', replacement="'", description='right single quotation with straight one'), + RegexpSpec(regexp=r'\u201C', replacement='"', description='left double quotation with straight one'), + RegexpSpec(regexp=r'\u201D', replacement='"', description='right double quotation with straight one'), +] + + +def main() -> int: + total_count_changes = 0 + matches = [re.compile(spec.regexp) for spec in REPLACEMENTS] + for file_string in sys.argv: + count_changes = 0 + path = Path(file_string) + text = path.read_text() + for index in range(len(matches)): + current_match = matches[index] + text, new_count_changes = current_match.subn(REPLACEMENTS[index].replacement, text) + if new_count_changes: + console.print( + f"[yellow] Performed {new_count_changes} replacements " + f"of {REPLACEMENTS[index].description}[/]: {path}" + ) + count_changes += new_count_changes + if count_changes: + path.write_text(text) + total_count_changes += count_changes + return 1 if total_count_changes else 0 + + +sys.exit(main()) diff --git a/tests/system/providers/papermill/input_notebook.ipynb b/tests/system/providers/papermill/input_notebook.ipynb index eb73f825e9f7c..d450712435bec 100644 --- a/tests/system/providers/papermill/input_notebook.ipynb +++ b/tests/system/providers/papermill/input_notebook.ipynb @@ -63,7 +63,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Inside the notebook you can save data by calling the glue function. Then later you can read the results of that notebook by “scrap” name (see the Airflow Papermill example DAG)" + "Inside the notebook you can save data by calling the glue function. Then later you can read the results of that notebook by 'scrap' name (see the Airflow Papermill example DAG)" ] }, {