diff --git a/.gitignore b/.gitignore
index b6e4761..93e82b4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -127,3 +127,10 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# VSCode config
+.vscode/
+
+# test artifacts
+test.pdf
+examples/.adastop_comparator.pkl
diff --git a/adastop/cli.py b/adastop/cli.py
index 1668420..d812e60 100644
--- a/adastop/cli.py
+++ b/adastop/cli.py
@@ -61,10 +61,9 @@ def compare(ctx, input_file, n_groups, size_group, n_permutations, alpha, beta,
             if i in comparator.current_comparisons.ravel():
                 names.append(comparator.agent_names[i])
 
-        Z = [np.hstack([comparator.eval_values[agent], df[agent]]) for agent in names]
-        if len(Z[0]) > comparator.K * n_fits_per_group:
-            raise ValueError('Error: you tried to use more group than what was initially declared, this is not allowed by the theory.')
+        if len(names) == 0:
+            raise ValueError('Error: you tried to use more groups than necessary. Use adastop status to see the current state for more info.')
 
         assert "continue" in list(comparator.decisions.values()), "Test finished at last iteration."
     else:
diff --git a/adastop/compare_agents.py b/adastop/compare_agents.py
index a32d743..40a97ea 100644
--- a/adastop/compare_agents.py
+++ b/adastop/compare_agents.py
@@ -212,7 +212,7 @@ def partial_compare(self, eval_values, verbose=True):
         if self.agent_names is None:
             self.agent_names = list(eval_values.keys())
 
-        Z = [eval_values[agent] for agent in self.agent_names]
+        Z = [np.array(eval_values[agent]) for agent in self.agent_names]
         n_managers = len(Z)
         if isinstance(self.n,int):
             self.n = np.array([self.n]*n_managers)
@@ -256,13 +256,13 @@
         # Compute admissible values, i.e. values that would not be rejected nor accepted.
         admissible_values_sup = values[
-            self.level_spent + icumulative_probas <= clevel
+            self.level_spent + icumulative_probas < clevel
         ]
 
         if len(admissible_values_sup) > 0:
             bk_sup = admissible_values_sup[0]  # the minimum admissible value
             level_to_add = icumulative_probas[
-                self.level_spent + icumulative_probas <= clevel
+                self.level_spent + icumulative_probas < clevel
             ][0]
         else:
             # This case is possible if clevel-self.level_spent <= 1/ self.normalization (smallest proba possible),
@@ -272,7 +272,7 @@
         cumulative_probas = np.arange(len(values)) / self.normalization # corresponds to P(T < t)
         admissible_values_inf = values[
-            self.power_spent + cumulative_probas < dlevel
+            self.power_spent + cumulative_probas <= dlevel
         ]
 
         if len(admissible_values_inf) > 0:
diff --git a/docs/tutorials.md b/docs/tutorials.md
index 233eac5..cde9077 100644
--- a/docs/tutorials.md
+++ b/docs/tutorials.md
@@ -15,6 +15,17 @@ The command line interface takes csv files as input. Each csv file must contain
 
 Below, we give an example based on files containing the evaluations of PPO,DDPG,SAC,TRPO, four Deep Reinforcement Learning algorithmes, given in the `examples` directory of the main repository.
 
+## Installation
+
+To install adastop, use pip:
+```bash
+pip install adastop
+```
+
+This will automatically install the command line interface as well as the Python library.
+
+
 ## Help for cli tool
 
 The AdaStop algorithm is initialized with the first test done through `adastop compare` and the current state of AdaStop is then saved in a pickle file.
 The help of `adastop` command line can be obtained with the following:
@@ -90,7 +101,7 @@ The input format of adastop is under the form of a csv file containing the score
 
 Let us launch AdaStop on this first batch of data.
 
-First, we clean up the corrent directory of any litter files that could have been spawned by a previous usage of `adastop` (if you never used `adastop` before, this command will not have any effect).
+First, we clean up the current directory of any leftover files that may have been created by a previous run of `adastop` (if you have never used `adastop` before, this command has no effect).
 
 ```bash
 adastop reset . # reset the state of the comparator (remove hidden pickle file)
 ```
@@ -144,14 +155,14 @@ adastop compare --n-groups 5 --size-group 5 walker5.csv
 
 Test is finished, decisions are
 
-|    | Agent1 vs Agent2 | mean Agent1 | mean Agent2 | mean diff | std Agent 1 | std Agent 2 | decisions |
-|--- |----------------- |------------ |------------ |---------- |------------ |------------ |---------- |
-| 0  | PPO vs DDPG      | 2901.53     | 884.119     | 2017.41   | 1257.93     | 535.74      | larger    |
-| 0  | PPO vs SAC       | 2901.53     | 4543.4      | -1641.87  | 1257.93     | 432.13      | smaller   |
-| 0  | PPO vs TRPO      | 2901.53     | 1215.42     | 1686.11   | 1257.93     | 529.672     | larger    |
-| 0  | DDPG vs SAC      | 884.119     | 4543.4      | -3659.28  | 535.74      | 432.13      | smaller   |
-| 0  | DDPG vs TRPO     | 884.119     | 1215.42     | -331.297  | 535.74      | 529.672     | smaller   |
-| 0  | SAC vs TRPO      | 4543.4      | 1215.42     | 3327.98   | 432.13      | 529.672     | larger    |
+| Agent1 vs Agent2 | mean Agent1 | mean Agent2 | mean diff | std Agent 1 | std Agent 2 | decisions |
+|----------------- |------------ |------------ |---------- |------------ |------------ |---------- |
+| PPO vs DDPG      | 2901.53     | 884.119     | 2017.41   | 1257.93     | 535.74      | larger    |
+| PPO vs SAC       | 2901.53     | 4543.4      | -1641.87  | 1257.93     | 432.13      | smaller   |
+| PPO vs TRPO      | 2901.53     | 1215.42     | 1686.11   | 1257.93     | 529.672     | larger    |
+| DDPG vs SAC      | 884.119     | 4543.4      | -3659.28  | 535.74      | 432.13      | smaller   |
+| DDPG vs TRPO     | 884.119     | 1215.42     | -331.297  | 535.74      | 529.672     | smaller   |
+| SAC vs TRPO      | 4543.4      | 1215.42     | 3327.98   | 432.13      | 529.672     | larger    |
 
 Comparator Saved
diff --git a/docs/tutorials.org b/docs/tutorials.org
index 77444ab..db09d1b 100644
--- a/docs/tutorials.org
+++ b/docs/tutorials.org
@@ -15,6 +15,21 @@ Please note that if, in the process of the algorithm, all the comparisons for on
 
 Below, we give an example based on files containing the evaluations of PPO,DDPG,SAC,TRPO, four Deep Reinforcement Learning algorithmes, given in the =examples= directory of the main repository.
+
+
+
+** Installation
+
+To install adastop, use pip:
+
+#+begin_src bash :session *shell* :results verbatim :exports both
+pip install adastop
+#+end_src
+
+This will automatically install the command line interface as well as the Python library.
+
+
 ** Help for cli tool
 
 The AdaStop algorithm is initialized with the first test done through =adastop compare= and the current state of AdaStop is then saved in a pickle file. The help of =adastop= command line can be obtained with the following:
@@ -47,7 +62,7 @@ The input format of adastop is under the form of a csv file containing the score
 
 Let us launch AdaStop on this first batch of data.
 
-First, we clean up the corrent directory of any litter files that could have been spawned by a previous usage of =adastop= (if you never used =adastop= before, this command will not have any effect).
+First, we clean up the current directory of any leftover files that may have been created by a previous run of =adastop= (if you have never used =adastop= before, this command has no effect).
 
 #+begin_src bash :session *shell* :results verbatim :exports both
 adastop reset . # reset the state of the comparator (remove hidden pickle file)
diff --git a/docs/user_guide.md b/docs/user_guide.md
index 8e871f2..30a7294 100644
--- a/docs/user_guide.md
+++ b/docs/user_guide.md
@@ -60,7 +60,7 @@ Then, once you did the comparison on the first file, you can use iteratively `ad
 
 #### Choice of comparisons
 
-In adastopn, one can choose which comparisons are done. The default is to do all the pairwise comparisons between two algorithms. In practice, it is sometimes sufficient to compare to only one of them, a benchmark, for this the `--compare-to-first` argument can be used. For a more fine-grained control on which comparison to do, the python API can take the comparisons as input.
+In adastop, one can choose which comparisons are done. The default is to do all pairwise comparisons between the algorithms. In practice, it is sometimes sufficient to compare against only one of them, a benchmark; for this, the `--compare-to-first` argument can be used. For more fine-grained control over which comparisons are done, the Python API can take the comparisons as input.
 
 **Remark**: it is not statistically ok to execute adastop several times and interpret the result as though it was only one test, if adastop is run several times this is multiple testing and some calibration has to be done. Instead, it is better to do all the comparisons at the same time, running the adastop algorithm only once, and adastop will handle the multiplicity of hypotheses by itself.
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 5fed177..08e6afd 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1,6 +1,7 @@
 import pytest
 from click.testing import CliRunner
 from adastop.cli import adastop
+import os
 
 # we reuse a bit of pytest's own testing machinery, this should eventually come
 import subprocess
@@ -8,23 +9,49 @@ def test_cli():
     runner = CliRunner()
+    test_pdf_path = "test.pdf"
+
+    if os.path.exists(test_pdf_path):
+        os.remove(test_pdf_path)
+
     result = runner.invoke(adastop, ['reset', 'examples'])
     assert result.exit_code == 0
 
     for j in range(1,6):
-
-        result = runner.invoke(adastop, ['compare', 'examples/walker'+str(j)+'.csv'])
+        result = runner.invoke(adastop, ['compare', "--seed", "1", 'examples/walker'+str(j)+'.csv'])
         assert result.exit_code == 0
 
-    result = runner.invoke(adastop, ['compare', 'examples/walker3.csv'])
+    result = runner.invoke(adastop, ['compare', "--seed", "1", 'examples/walker3.csv'])
     assert result.exit_code == 1
+    assert result.exception.args[0] == 'Error: you tried to use more groups than necessary. Use adastop status to see the current state for more info.'
-    result = runner.invoke(adastop, ['plot', 'examples', "test.pdf"])
+
+    result = runner.invoke(adastop, ['plot', 'examples', test_pdf_path])
     assert result.exit_code == 0
 
     result = runner.invoke(adastop, ['status', 'examples'])
     assert result.exit_code == 0
 
+    assert os.path.exists(test_pdf_path)
+
     result = runner.invoke(adastop, ['reset', 'examples'])
     assert result.exit_code == 0
 
-    result = runner.invoke(adastop, ['compare', "--compare-to-first", 'examples/walker1.csv'])
+    result = runner.invoke(adastop, ['compare', "--compare-to-first", "--seed", "1", 'examples/walker1.csv'])
     assert result.exit_code == 0
+
+
+
+def test_plot_no_comparator_save_file():
+    runner = CliRunner()
+    runner.invoke(adastop, ['reset', 'examples'])
+
+    result = runner.invoke(adastop, ['plot', 'examples', "test.pdf"])
+    assert result.exit_code == 1
+    assert result.exception.args[0] == 'Comparator save file not found.'
+
+def test_status_no_comparator_save_file():
+    runner = CliRunner()
+    runner.invoke(adastop, ['reset', 'examples'])
+
+    result = runner.invoke(adastop, ['status', 'examples'])
+    assert result.exit_code == 1
+    assert result.exception.args[0] == 'Comparator save file not found.'
diff --git a/tests/test_error_toy.py b/tests/test_compare_agents.py
similarity index 71%
rename from tests/test_error_toy.py
rename to tests/test_compare_agents.py
index 6df8bff..2998d89 100644
--- a/tests/test_error_toy.py
+++ b/tests/test_compare_agents.py
@@ -5,16 +5,27 @@
 B = 5000
 alpha = 0.05
 n_runs = 10
+seed = 42
 
-def test_runtime():
+def test_partial_compare():
+    rng = np.random.RandomState(seed)
+
     idxs = []
     comparator = MultipleAgentsComparator(n=3, K=3, B=B, alpha=alpha, seed=42, beta = 0.01)
-    evals = {"Agent "+str(k):np.random.normal(size=3) for k in range(3)}
+    evals = {"Agent "+str(k): rng.normal(size=3) for k in range(3)}
     comparator.partial_compare(evals)
+
+
+def test_partial_compare_not_enough_points():
+    comparator = MultipleAgentsComparator(n=3, K=3, B=5000, alpha=-1e-5, seed=42, beta = 0.01)
+    evals = {"Agent 1":np.array([0,0,0]),"Agent 2":np.array([0,0,0]),"Agent 3":np.array([0,0,0])}
+    comparator.partial_compare(evals)
+
 
 @pytest.mark.parametrize("K,n", [(10,2),(5,3), (3, 5), (1, 15)])
 def test_type1(K,n):
+    rng = np.random.RandomState(seed)
+
     idxs = []
     n_agents = 3
     for M in range(n_runs):
@@ -23,9 +34,9 @@ def test_type1(K,n):
         while not comparator.is_finished:
             if len(evals) >0:
                 for k in range(n_agents):
-                    evals["Agent "+str(k)] = np.hstack([evals["Agent "+str(k)] ,np.random.normal(size=n)])
+                    evals["Agent "+str(k)] = np.hstack([evals["Agent "+str(k)], rng.normal(size=n)])
             else:
-                evals = {"Agent "+str(k): np.random.normal(size=n) for k in range(n_agents)}
+                evals = {"Agent "+str(k): rng.normal(size=n) for k in range(n_agents)}
             comparator.partial_compare(evals)
         idxs.append(not("equal" in comparator.decisions.values()))
     print(comparator.get_results())
@@ -33,6 +44,8 @@ def test_type1(K,n):
 
 @pytest.mark.parametrize("K,n", [(5,3), (3, 5), (1, 15)])
 def test_type1_large_beta(K,n):
+    rng = np.random.RandomState(seed)
+
     idxs = []
     n_agents = 3
     for M in range(n_runs):
@@ -41,9 +54,9 @@ def test_type1_large_beta(K,n):
         while not comparator.is_finished:
             if len(evals) >0:
                 for k in range(n_agents):
-                    evals["Agent "+str(k)] = np.hstack([evals["Agent "+str(k)] ,np.random.normal(size=n)])
+                    evals["Agent "+str(k)] = np.hstack([evals["Agent "+str(k)], rng.normal(size=n)])
             else:
-                evals = {"Agent "+str(k): np.random.normal(size=n) for k in range(n_agents)}
+                evals = {"Agent "+str(k): rng.normal(size=n) for k in range(n_agents)}
             comparator.partial_compare(evals)
         idxs.append(not("equal" in comparator.decisions.values()))
     print(comparator.get_results())
@@ -51,6 +64,8 @@ def test_type1_large_beta(K,n):
 
 @pytest.mark.parametrize("K,n", [(3, 5), (1, 15)])
 def test_type2(K,n):
+    rng = np.random.RandomState(seed)
+
     idxs = []
     n_agents = 2
     for M in range(n_runs):
@@ -59,9 +74,9 @@ def test_type2(K,n):
         while not comparator.is_finished:
             if len(evals) >0:
                 for k in range(n_agents):
-                    evals["Agent "+str(k)] = np.hstack([evals["Agent "+str(k)] ,np.random.normal(size=n)+2*k])
+                    evals["Agent "+str(k)] = np.hstack([evals["Agent "+str(k)], rng.normal(size=n)+2*k])
             else:
-                evals = {"Agent "+str(k): np.random.normal(size=n)+2*k for k in range(n_agents)}
+                evals = {"Agent "+str(k): rng.normal(size=n)+2*k for k in range(n_agents)}
             comparator.partial_compare(evals)
         idxs.append(not("equal" in comparator.decisions.values()))
     assert np.mean(idxs) > 0.3, "type 2 error seems to be too large."
diff --git a/tests/test_plot.py b/tests/test_plot.py
index 91e9fb9..e36303d 100644
--- a/tests/test_plot.py
+++ b/tests/test_plot.py
@@ -76,3 +76,39 @@ def test_plot_sota_noteq():
     # plt.savefig('fig2.pdf')
     fig, axes= plt.subplots(1,2)
     comparator.plot_results_sota(axes=axes)
+
+
+
+def test_plot_noteq2():
+    n_agents = 3
+    comparator = MultipleAgentsComparator(n=10, K=K, B=B, alpha=alpha, seed=42, beta = 0.01)
+    evals = {}
+    while not comparator.is_finished:
+        if len(evals) > 0:
+            for k in range(n_agents):
+                evals["Agent "+str(k)] = np.hstack([evals["Agent "+str(k)], np.abs(2*K-k)+np.random.normal(size=10)])
+        else:
+            evals = {"Agent "+str(k): np.random.normal(size=10)+np.abs(2*K-k) for k in range(n_agents)}
+        comparator.partial_compare(evals)
+    # plt.savefig('fig2.pdf')
+    fig, axes = plt.subplots(1, 2)
+    comparator.plot_results(axes=axes)
+
+def test_plot_sota_noteq2():
+    n_agents = 3
+    comparisons = np.array([(0,i) for i in [1,2]])
+    comparator = MultipleAgentsComparator(n=10, K=K, B=B, alpha=alpha,
+                                          comparisons=comparisons, seed=42, beta = 0.01)
+    evals = {}
+    while not comparator.is_finished:
+        if len(evals) > 0:
+            for k in range(n_agents):
+                evals["Agent "+str(k)] = np.hstack([evals["Agent "+str(k)], np.random.normal(size=10)+np.abs(2*K-k)])
+        else:
+            evals = {"Agent "+str(k): np.random.normal(size=10)+np.abs(2*K-k) for k in range(n_agents)}
+        comparator.partial_compare(evals)
+    comparator.plot_results_sota()
+    # plt.savefig('fig2.pdf')
+    fig, axes = plt.subplots(1, 2)
+    comparator.plot_results_sota(axes=axes)
+
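
A note on the `partial_compare` hunk in `adastop/compare_agents.py`: wrapping each entry of `eval_values` in `np.array(...)` means the Python API now tolerates plain Python lists as well as NumPy arrays. Below is a minimal usage sketch modeled on the loops in the test files; the agent names, the 0.5 shift, and the parameter values are illustrative assumptions, and the import path is assumed from the file layout, not prescribed by the package.

```python
import numpy as np
from adastop.compare_agents import MultipleAgentsComparator

rng = np.random.RandomState(42)
comparator = MultipleAgentsComparator(n=5, K=3, B=5000, alpha=0.05, seed=42, beta=0.01)

evals = {}
while not comparator.is_finished:
    for name, shift in [("PPO", 0.0), ("DDPG", 0.5)]:
        # Plain lists are fine here: partial_compare coerces them with np.array(...).
        batch = list(rng.normal(size=5) + shift)
        evals[name] = np.hstack([evals[name], batch]) if name in evals else batch
    comparator.partial_compare(evals)  # expects the cumulative evaluations so far

print(comparator.decisions)      # per-comparison decisions such as 'larger', 'smaller', 'equal'
print(comparator.get_results())
```

Before this change, list input would presumably fail wherever array operations were applied to `Z`; the coercion makes the accepted input explicit.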
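The inequality flips in the level-spending block (`<=` to `<` for `admissible_values_sup`, `<` to `<=` for `admissible_values_inf`) only change how values sitting exactly on the remaining budget are classified. A toy illustration of the selection mechanics, with made-up numbers standing in for `values`, `icumulative_probas`, `level_spent`, and `clevel` (this is not the library's internal code):

```python
import numpy as np

values = np.array([0.1, 0.5, 0.9, 1.3, 2.0])               # sorted permutation-statistic values
icumulative_probas = np.array([0.8, 0.6, 0.4, 0.2, 0.0])   # stand-in for the tail probabilities
level_spent, clevel = 0.0, 0.4                             # level already spent, current budget

# Old boundary: a value whose tail probability exactly meets the budget is admissible.
old_sup = values[level_spent + icumulative_probas <= clevel]   # -> [0.9, 1.3, 2.0]
# New boundary: that value is excluded, so the minimum admissible value (bk_sup)
# moves up from 0.9 to 1.3 and strictly less than the budget is spent.
new_sup = values[level_spent + icumulative_probas < clevel]    # -> [1.3, 2.0]

print(old_sup[0], new_sup[0])
```

The paired change on the acceptance side (`admissible_values_inf`) mirrors this, moving the exact-equality case to the other side of that boundary.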
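Finally, the test changes replace calls to the global `np.random.normal` with draws from a locally constructed `np.random.RandomState(seed)`, so each test owns a reproducible stream and no longer depends on, or mutates, NumPy's global RNG state. The pattern in isolation, as a sketch (the helper name is invented for illustration):

```python
import numpy as np

seed = 42

def make_evals(n_agents=3, size=5):
    # A locally seeded generator: draws are identical on every call and are
    # unaffected by test ordering, unlike the global np.random.normal.
    rng = np.random.RandomState(seed)
    return {"Agent " + str(k): rng.normal(size=size) for k in range(n_agents)}

assert np.allclose(make_evals()["Agent 0"], make_evals()["Agent 0"])
```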