diff --git a/Modeling eMNS/Generative_model_ETH_v2.ipynb b/Modeling eMNS/Generative_model_ETH_v2.ipynb
index bc9c12e..d90b528 100644
--- a/Modeling eMNS/Generative_model_ETH_v2.ipynb
+++ b/Modeling eMNS/Generative_model_ETH_v2.ipynb
@@ -7,6 +7,15 @@
     "### Train ETH data to CNN generative network"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install -U \"ray[data,train,tune,serve]\""
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -90,12 +99,36 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "KeyboardInterrupt",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "KeyboardInterrupt raised inside tuner.fit() while Ray Tune was checkpointing the experiment state (TuneController.save_to_dir -> json.dump with TuneFunctionEncoder); the full ANSI-escaped traceback saved in the notebook is trimmed here."
+     ]
+    }
+   ],
    "source": [
     "from Neural_network import Generative_net, Generative_net_test, ResidualEMNSBlock_3d, BigBlock, weight_init, eMNS_Dataset\n",
-    "from Training_loop_v2 import train_GM\n",
+    "from Training_loop_v2 import train_GM, train_GM_ray\n",
     "from functools import partial\n",
     "from ray.train import RunConfig, ScalingConfig, CheckpointConfig\n",
     "from ray.train.torch import TorchTrainer\n",
@@ -124,9 +157,9 @@
     "    )\n",
     "param_space = {\n",
     "    \"scaling_config\": ScalingConfig(\n",
-    "        num_workers = 1,\n",
+    "        num_workers = 3,\n",
     "        use_gpu = use_gpu,\n",
-    "        resources_per_worker = {\"CPU\":10, \"GPU\":2}\n",
+    "        # resources_per_worker = {\"CPU\":4, \"GPU\":1}\n",
     "    ),\n",
     "    # You can even grid search various datasets in Tune.\n",
     "    # \"datasets\": {\n",
@@ -135,13 +168,13 @@
     "    #     ),\n",
     "    # },\n",
     "    \"train_loop_config\": {\n",
-    "        'epochs': tune.choice([350]),\n",
-    "        'lr_max': tune.loguniform(1e-4,1e-2),\n",
-    "        'lr_min': tune.loguniform(1e-5,1e-7),\n",
-    "        'batch_size': tune.choice([4,8,16]),\n",
-    "        'L2_norm' : tune.choice([0]),\n",
+    "        'epochs': 350,\n",
+    "        'lr_max': 1e-4,\n",
+    "        'lr_min': 2.5e-6,\n",
+    "        'batch_size': 8,\n",
+    "        'L2_norm' : 0,\n",
     "        'verbose': False,\n",
-    "        'DF' : tune.choice([True,False]),\n",
+    "        'DF' : False,\n",
     "        'schedule': [],\n",
     "        'grid_space': 16**3,\n",
     "        'learning_rate_decay': 0.5,\n",
@@ -251,7 +284,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "plot_ray_results(results, metrics_names=['rmse_train','rmse_val'],ylim=[20,50])"
+    "plot_ray_results(results, metrics_names=['rmse_train','rmse_val'],ylim=[3,50])"
    ]
   },
@@ -280,13 +313,13 @@
     ")\n",
     "\n",
     "config = {\n",
-    "    'epochs': 10,\n",
+    "    'epochs': 350,\n",
     "    'lr_max': 1e-4,\n",
     "    'lr_min': 2.5e-6,\n",
     "    'batch_size': 8,\n",
     "    'L2_norm' : 0,\n",
     "    'verbose': False,\n",
-    "    'DF' : True,\n",
+    "    'DF' : False,\n",
     "    'schedule': [],\n",
     "    'grid_space': 16**3,\n",
     "    'learning_rate_decay': 0.5,\n",
diff --git a/Modeling eMNS/Training_loop_v2.py b/Modeling eMNS/Training_loop_v2.py
index f27233d..de81176 100644
--- a/Modeling eMNS/Training_loop_v2.py
+++ b/Modeling eMNS/Training_loop_v2.py
@@ -5,7 +5,7 @@
 from torch.nn.parallel import DistributedDataParallel
 import torch.nn.functional as F
 from early_stopping import EarlyStopping, EarlyDecay
-from utils import compute_discrete_curl, denorm, max_min_norm
+from utils import compute_discrete_curl, denorm, max_min_norm, denorm_ray
 from Neural_network import ResidualEMNSBlock_3d, BigBlock, Generative_net
 import numpy as np
 from ray import train, tune
@@ -69,7 +69,7 @@ def adjust_learning_rate_linear(optimizer, linear_increment):
 ######################################################################################################################################
-# def train_part_GM(model,optimizer,train_loader,valid_loader, epochs = 1, learning_rate_decay =.1,weight_decay=1e-4, schedule=[], grid_space= 20*20*20, DF= False, verbose=True, device= 'cuda',maxB=[],minB=[], lr_max=1e-4, lr_min=2.5e-6,max_epoch=200, linear_lr=False):
+
 def train_GM(config):
     """
     Train a model using torch API
@@ -100,7 +100,7 @@ def train_GM(config):
     device = config['device']
     train_set = config['train_set']
     valid_set = config['valid_set']
-    
+
     ####################################################
     #--------------model construction------------------
     ####################################################
@@ -113,10 +113,20 @@ def train_GM(config):
     model = Generative_net(SB_args, BB_args, SB_block, BB_block, num_input=num_input, output_shape= output_shape)
-    model = model.to(device=device)
-    # prepare model for training
-    # model = train.torch.prepare_model(model)
+
+
+    # ####################################################
+    # #---------------GPU parallel-----------------------
+    # ####################################################
+    if torch.cuda.device_count() > 1:
+        model = torch.nn.DataParallel(model)
+    if device == 'cuda':
+        device = 'cuda:' + str(torch.cuda.current_device())
+    model.to(device)
+    print(device)
+    # # prepare model for training
+    # model = train.torch.prepare_model(model)
     #####################################################
     #-------------------data loader----------------------
     #####################################################
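The GPU-parallel branch added above pins a bare 'cuda' to an explicit device index before moving the model, since DataParallel scatters each batch from a single primary device. A hypothetical helper isolating that logic (resolve_device is not part of the repo):

import torch

def resolve_device(device: str = 'cuda') -> str:
    # Replace a bare 'cuda' with the indexed form so DataParallel replicas
    # and the input batches agree on the primary device.
    if device == 'cuda' and torch.cuda.is_available():
        return f'cuda:{torch.cuda.current_device()}'
    return device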
@@ -236,6 +246,179 @@ def train_GM(config):
 
 
 
     return rmse_history, rmse_val_history,loss_history, iter_history,mse_history, mse_val_history,epoch_stop,Rsquare
+#-------------------------------------------------------------------------------------------------------
+def train_GM_ray(config):
+    """
+    Ray Train worker loop: train the generative model using the torch API.
+
+    Inputs:
+    - config: dict of hyperparameters (epochs, lr_max, lr_min, batch_size,
+      L2_norm, ...), the normalization bounds maxB/minB, and the train/
+      validation datasets; see train_loop_config in the notebook
+
+    Returns: model accuracies, prints model loss during training
+    """
+    #---------------unpack config---------------------
+    # print(config)
+    batch_size = config['batch_size']
+    epochs = config["epochs"]
+    verbose = config['verbose']
+    lr_max = config['lr_max']
+    lr_min = config['lr_min']
+    DF = config['DF']  # whether to use the divergence-free model
+    grid_space = config['grid_space']
+    schedule = config['schedule']
+    learning_rate_decay = config['learning_rate_decay']
+    maxB = config['maxB']
+    minB = config['minB']
+    skip_spacing = config['skip_spacing']
+    num_repeat = config['num_repeat']
+    num_block = config['num_block']
+    device = config['device']
+    train_set = config['train_set']
+    valid_set = config['valid_set']
+
+    ####################################################
+    #--------------model construction------------------
+    ####################################################
+    num_input = 8
+    output_shape = (3,16,16,16)
+    SB_args = (64,64,skip_spacing,num_repeat) # (Cin, Cout, skip_spacing, num_repeat)
+    BB_args = (2,num_block) # (scale_factor, num_block)
+    SB_block = ResidualEMNSBlock_3d
+    BB_block = BigBlock
+
+    model = Generative_net(SB_args, BB_args, SB_block, BB_block, num_input=num_input, output_shape= output_shape)
+
+    # ####################################################
+    # #---------------GPU parallel-----------------------
+    # ####################################################
+    # if torch.cuda.device_count() > 1:
+    #     model = torch.nn.DataParallel(model)
+    #     device = torch.cuda.current_device
+    # prepare model for training: moves it to this worker's device and wraps
+    # it in DistributedDataParallel when there is more than one worker
+    model = train.torch.prepare_model(model)
+    #####################################################
+    #-------------------data loader----------------------
+    #####################################################
+
+    train_loader = torch.utils.data.DataLoader(dataset=train_set,batch_size=config['batch_size'],shuffle=True)
+    valid_loader = torch.utils.data.DataLoader(dataset=valid_set,batch_size=config['batch_size'],shuffle=True)
+    # Shard batches across workers and move them to this worker's device to
+    # match the prepared model; otherwise batches would stay on the CPU.
+    train_loader = train.torch.prepare_data_loader(train_loader)
+    valid_loader = train.torch.prepare_data_loader(valid_loader)
+
+    #####################################################
+    #-------------------optimizer-------------------------
+    #####################################################
+
+    optimizer = torch.optim.Adam(
+        [{'params': model.parameters()}],
+        lr= config['lr_max'],
+        weight_decay= config['L2_norm'],
+        betas=(0.5,0.99))
+
+    #------------------------------------------------------
+    num_iters = epochs*len(train_loader)
+    print_every = 100
+    adjust_epoch_count = 0
+    if verbose:
+        num_prints = num_iters // print_every + 1
+    else:
+        num_prints = epochs
+
+    # initialize loss history and iter history
+    rmse_history = torch.zeros(num_prints,dtype = torch.float)
+    rmse_val_history = torch.zeros(num_prints,dtype = torch.float)
+    iter_history = torch.zeros(num_prints,dtype = torch.float)
+    loss_history = torch.zeros(num_prints,dtype = torch.float)
+    mse_history = torch.zeros(num_prints,dtype = torch.float)
+    mse_val_history = torch.zeros(num_prints,dtype = torch.float)
+
+    patience = 20  # stop training early once the validation loss has not decreased for `patience` consecutive epochs, to prevent overfitting
+    early_stopping = EarlyStopping(patience, verbose=True)
+    early_decay = EarlyDecay(patience, delta=0.005, lr_min=lr_min)
+    epoch_stop = 0
+
+    ###########################################################
+    # train loop:
+    # step 1: update the learning rate
+    # step 2: put the model in train mode, move data to the gpu
+    # step 3: compute scores, calculate the loss function
+    # step 4: zero out all of the gradients the optimizer will update
+    # step 5: compute the gradient of the loss, update parameters
+    ###########################################################
+    for epoch in range(epochs):
+        for t, (x,y) in enumerate(train_loader):
+            model.train()
+
+            # x,_,_ = max_min_norm(x,device)
+            # y,_,_ = max_min_norm(y,device)
+            optimizer.zero_grad() # zero out all of the gradients
+            if DF:
+                _, preds = Jacobian3(model(x))
+            else:
+                preds = model(x)
+            # loss function from the paper "Modeling Electromagnetic Navigation Systems":
+            # loss = lamda_b*|y - preds| + lamda_g*|nabla(y) - nabla(preds)|
+            l1_loss = F.l1_loss(preds,y)
+            Grad_loss = grad_loss_Jacobain(preds,y)
+            loss = l1_loss + Grad_loss
+            loss.backward() # compute gradient of loss
+            optimizer.step() # update parameters
+
+            tt = t + epoch*len(train_loader) + 1
+            adjust_learning_rate_cosine(optimizer, lr_max, lr_min, epochs, tt, len(train_loader))
+            # early_decay(loss, optimizer, learning_rate_decay)
+            ###########################################################
+            # print loss during training
+            if verbose and (tt % print_every == 1 or (epoch == epochs -1 and t == len(train_loader) -1)):
+                print(f'Epoch {epoch:d}, Iteration {tt:d}, loss = {loss.item():.4f}, l1 loss={l1_loss.item():.4f}, grad loss={Grad_loss.item():.4f}')
+                rmse_val,mse_val,Rsquare = check_rmse_CNN_ray(valid_loader,model, grid_space, DF,maxB=maxB,minB=minB)
+                rmse,mse_train,R_TEMP = check_rmse_CNN_ray(train_loader,model, grid_space, DF,maxB=maxB,minB=minB)
+                rmse_val_history[tt // print_every] = rmse_val
+                rmse_history[tt // print_every] = rmse
+                iter_history[tt // print_every] = tt
+                loss_history[tt // print_every] = loss.item()
+                print()
+
+            elif not verbose and (t == len(train_loader)-1):
+                print(f'Epoch {epoch:d}, Iteration {tt:d}, loss = {loss.item():.4f}, l1 loss={l1_loss.item():.4f}, grad loss={Grad_loss.item():.4f}')
+                rmse_val,mse_val,Rsquare = check_rmse_CNN_ray(valid_loader,model, grid_space,DF,maxB=maxB,minB=minB)
+                rmse,mse_train,R_TEMP = check_rmse_CNN_ray(train_loader,model, grid_space,DF,maxB=maxB,minB=minB)
+                rmse_val_history[epoch] = rmse_val
+                rmse_history[epoch] = rmse
+                iter_history[epoch] = tt
+                loss_history[epoch] = loss.item()
+                mse_history[epoch] = mse_train
+                mse_val_history[epoch] = mse_val
+
+                print()
+                adjust_epoch_count += 1
+
+        # # create checkpoint
+        # base_model = (model.module
+        #               if isinstance(model, DistributedDataParallel) else model)
+        # checkpoint_dir = tempfile.mkdtemp()
+        # # training state to save
+        # checkpoint_data = {
+        #     "epoch": epoch,
+        #     "net_state_dict": base_model.state_dict(),
+        #     "optimizer_state_dict": optimizer.state_dict(),
+        # }
+        # torch.save(checkpoint_data, os.path.join(checkpoint_dir, "model.pt"))
+        # checkpoint = Checkpoint.from_directory(checkpoint_dir)
+        # Send the current training result back to Tune
+        train.report({'rmse_val':rmse_val.item(), 'rmse_train': rmse.item(), 'loss':loss.item()})
+
+        adjust_learning_rate_sch(optimizer, learning_rate_decay, epoch, schedule)
+        epoch_stop = epoch
+
+    return rmse_history, rmse_val_history,loss_history, iter_history,mse_history, mse_val_history,epoch_stop,Rsquare
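train_GM_ray adjusts the learning rate every iteration through adjust_learning_rate_cosine, whose body sits earlier in Training_loop_v2.py and is not part of this diff. A plausible sketch, assuming standard cosine annealing from lr_max down to lr_min over the whole run (tt is the global iteration counter):

import math

def adjust_learning_rate_cosine(optimizer, lr_max, lr_min, epochs, tt, iters_per_epoch):
    # Anneal from lr_max at tt = 0 to lr_min at the final iteration.
    total_iters = epochs * iters_per_epoch
    lr = lr_min + 0.5 * (lr_max - lr_min) * (1 + math.cos(math.pi * tt / total_iters))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr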
@@ -295,7 +478,61 @@ def check_rmse_CNN(dataloader,model, grid_space, device, DF, maxB=[],minB=[]):
 
 
     return rmse, mse_temp/num_samples/grid_space/3, Rsquare
+#-----------------------------------------------------------------
+
+def get_mean_of_dataloader_ray(dataloader,model):
+    num_samples = 0
+    b = torch.zeros(1)
+    model.eval()
+    for x,y in dataloader:
+        # sum over the batch and divide by the total count at the end, so
+        # batches of unequal size are weighted correctly
+        y_sum = y.sum(dim=0,keepdim=True)
+        num_samples += y.shape[0]
+        # print(y.shape[0])
+        b = b + y_sum
+    return b/num_samples
+
+def check_rmse_CNN_ray(dataloader,model, grid_space, DF, maxB=[],minB=[]):
+    '''
+    Check the RMSE (and R^2) of the CNN on a dataloader
+    '''
+    mse_temp = 0
+    R_temp = 0
+    Rsquare = 0
+    num_samples = 0
+
+    Bfield_mean = get_mean_of_dataloader_ray(dataloader,model)
+    # print(Bfield_mean)
+
+    model.eval() # set model to evaluation mode
+
+    with torch.no_grad():
+        for x,y in dataloader:
+            num_samples += x.shape[0]
+            if DF:
+                _, scores = Jacobian3(model(x))
+            else:
+                scores = model(x)
+
+            # compute MSE and R^2 on de-normalized data (scaled by 1e3)
+            mse_temp += F.mse_loss(1e3*denorm_ray(scores,maxB,minB), 1e3*denorm_ray(y,maxB,minB), reduction='sum')
+            R_temp += F.mse_loss(1e3*denorm_ray(Bfield_mean.expand_as(y),maxB,minB), 1e3*denorm_ray(y,maxB,minB), reduction='sum')
+
+    rmse = torch.sqrt(mse_temp/num_samples/grid_space/3)
+
+    # R^2 = 1 - SS_res/SS_tot; both sums already run over the same samples
+    Rsquare = 1 - mse_temp/R_temp
+    print(f'Got rmse {rmse}')
+
+    return rmse, mse_temp/num_samples/grid_space/3, Rsquare
+#----------------------------------------------------------------
 def grad_loss(preds, y):
     '''
     preds, y shape: (batch, dimension, grid_x, grid_y, grid_z)
diff --git a/Modeling eMNS/utils.py b/Modeling eMNS/utils.py
index ba8ba36..0812754 100644
--- a/Modeling eMNS/utils.py
+++ b/Modeling eMNS/utils.py
@@ -62,6 +62,14 @@ def denorm(x_norm, Bmax, Bmin, device):
     x = 0.5*(x_norm+1)*(Bmax.expand_as(x_norm).to(device)-Bmin.expand_as(x_norm).to(device)) + Bmin.expand_as(x_norm).to(device)
     return x
 
+def denorm_ray(x_norm, Bmax, Bmin):
+    '''
+    Invert the max-min normalization:
+    x = 0.5*(x_norm+1)*(Bmax-Bmin) + Bmin
+    '''
+    x = 0.5*(x_norm+1)*(Bmax.expand_as(x_norm)-Bmin.expand_as(x_norm)) + Bmin.expand_as(x_norm)
+    return x
+
 def max_min_norm(x,device):
     """