7 changes: 7 additions & 0 deletions .gitignore
@@ -127,3 +127,10 @@ dmypy.json

# Pyre type checker
.pyre/

# VSCode config
.vscode/

# test artefacts
test.pdf
examples/.adastop_comparator.pkl
5 changes: 2 additions & 3 deletions adastop/cli.py
@@ -61,10 +61,9 @@ def compare(ctx, input_file, n_groups, size_group, n_permutations, alpha, beta,
if i in comparator.current_comparisons.ravel():
names.append(comparator.agent_names[i])


Z = [np.hstack([comparator.eval_values[agent], df[agent]]) for agent in names]
if len(Z[0]) > comparator.K * n_fits_per_group:
raise ValueError('Error: you tried to use more group than what was initially declared, this is not allowed by the theory.')
if len(names) == 0:
raise ValueError('Error: you tried to use more group than necessary. Use adastop status to see current status for more info.')
assert "continue" in list(comparator.decisions.values()), "Test finished at last iteration."

else:
8 changes: 4 additions & 4 deletions adastop/compare_agents.py
@@ -212,7 +212,7 @@ def partial_compare(self, eval_values, verbose=True):
if self.agent_names is None:
self.agent_names = list(eval_values.keys())

Z = [eval_values[agent] for agent in self.agent_names]
Z = [np.array(eval_values[agent]) for agent in self.agent_names]
n_managers = len(Z)
if isinstance(self.n,int):
self.n = np.array([self.n]*n_managers)
@@ -256,13 +256,13 @@ def partial_compare(self, eval_values, verbose=True):

# Compute admissible values, i.e. values that would not be rejected nor accepted.
admissible_values_sup = values[
self.level_spent + icumulative_probas <= clevel
self.level_spent + icumulative_probas < clevel
]

if len(admissible_values_sup) > 0:
bk_sup = admissible_values_sup[0] # the minimum admissible value
level_to_add = icumulative_probas[
self.level_spent + icumulative_probas <= clevel
self.level_spent + icumulative_probas < clevel
][0]
else:
# This case is possible if clevel-self.level_spent <= 1/ self.normalization (smallest proba possible),
@@ -272,7 +272,7 @@

cumulative_probas = np.arange(len(values)) / self.normalization # corresponds to P(T < t)
admissible_values_inf = values[
self.power_spent + cumulative_probas < dlevel
self.power_spent + cumulative_probas <= dlevel
]

if len(admissible_values_inf) > 0:
29 changes: 20 additions & 9 deletions docs/tutorials.md
@@ -15,6 +15,17 @@ The command line interface takes csv files as input. Each csv file must contain
Below, we give an example based on files containing the evaluations of PPO, DDPG, SAC, and TRPO, four Deep Reinforcement Learning algorithms, given in the `examples` directory of the main repository.


## Installation

To install adastop, use pip:
```bash
pip install adastop
```

This will automatically install the command-line interface as well as the Python library.
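
In addition to the `adastop` command, the comparator can be driven from Python. The snippet below is only a minimal sketch based on the usage in `tests/test_compare_agents.py` from this PR; the data are made up and the import path assumes the class lives in `adastop.compare_agents` as in this repository.

```python
import numpy as np
from adastop.compare_agents import MultipleAgentsComparator

rng = np.random.RandomState(42)
# Illustrative scores: 5 evaluations for each of two agents.
evals = {
    "Agent A": rng.normal(loc=1.0, size=5),
    "Agent B": rng.normal(loc=0.0, size=5),
}

comparator = MultipleAgentsComparator(n=5, K=3, B=5000, alpha=0.05, beta=0.01, seed=42)
comparator.partial_compare(evals)  # first interim test of the sequential procedure
print(comparator.decisions)        # maps comparisons to "continue", "equal", "smaller" or "larger"
```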



## Help for cli tool

The AdaStop algorithm is initialized with the first test, done through `adastop compare`, and the current state of AdaStop is then saved in a pickle file. The help for the `adastop` command line can be obtained with the following:
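
For example, assuming the standard Click `--help` flag:

```bash
adastop --help
```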
@@ -90,7 +101,7 @@ The input format of adastop is under the form of a csv file containing the score
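
As an illustration of this input format, a file such as `examples/walker1.csv` has one column per agent and one row per evaluation; the values below are made up and the real files may also carry an index column:

```csv
PPO,DDPG,SAC,TRPO
2734.1,912.5,4421.8,1189.0
3102.7,801.3,4602.2,1304.6
1985.4,950.8,4498.6,1122.3
```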

Let us launch AdaStop on this first batch of data.

First, we clean up the corrent directory of any litter files that could have been spawned by a previous usage of \`adastop\` (if you never used \`adastop\` before, this command will not have any effect).
First, we clean up the current directory of any litter files that could have been spawned by a previous usage of \`adastop\` (if you never used \`adastop\` before, this command will not have any effect).

```bash
adastop reset . # reset the state of the comparator (remove hidden pickle file)
Expand Down Expand Up @@ -144,14 +155,14 @@ adastop compare --n-groups 5 --size-group 5 walker5.csv

Test is finished, decisions are

| | Agent1 vs Agent2 | mean Agent1 | mean Agent2 | mean diff | std Agent 1 | std Agent 2 | decisions |
|--- |---------------- |----------- |----------- |--------- |----------- |----------- |--------- |
| 0 | PPO vs DDPG | 2901.53 | 884.119 | 2017.41 | 1257.93 | 535.74 | larger |
| 0 | PPO vs SAC | 2901.53 | 4543.4 | -1641.87 | 1257.93 | 432.13 | smaller |
| 0 | PPO vs TRPO | 2901.53 | 1215.42 | 1686.11 | 1257.93 | 529.672 | larger |
| 0 | DDPG vs SAC | 884.119 | 4543.4 | -3659.28 | 535.74 | 432.13 | smaller |
| 0 | DDPG vs TRPO | 884.119 | 1215.42 | -331.297 | 535.74 | 529.672 | smaller |
| 0 | SAC vs TRPO | 4543.4 | 1215.42 | 3327.98 | 432.13 | 529.672 | larger |
| Agent1 vs Agent2 | mean Agent1 | mean Agent2 | mean diff | std Agent 1 | std Agent 2 | decisions |
|----------------- |------------ |------------ |---------- |------------ |------------ |---------- |
| PPO vs DDPG | 2901.53 | 884.119 | 2017.41 | 1257.93 | 535.74 | larger |
| PPO vs SAC | 2901.53 | 4543.4 | -1641.87 | 1257.93 | 432.13 | smaller |
| PPO vs TRPO | 2901.53 | 1215.42 | 1686.11 | 1257.93 | 529.672 | larger |
| DDPG vs SAC | 884.119 | 4543.4 | -3659.28 | 535.74 | 432.13 | smaller |
| DDPG vs TRPO | 884.119 | 1215.42 | -331.297 | 535.74 | 529.672 | smaller |
| SAC vs TRPO | 4543.4 | 1215.42 | 3327.98 | 432.13 | 529.672 | larger |

Comparator Saved

17 changes: 16 additions & 1 deletion docs/tutorials.org
@@ -15,6 +15,21 @@ Please note that if, in the process of the algorithm, all the comparisons for on

Below, we give an example based on files containing the evaluations of PPO, DDPG, SAC, and TRPO, four Deep Reinforcement Learning algorithms, given in the =examples= directory of the main repository.




** Installation

To install adastop, use pip:

#+begin_src bash :session *shell* :results verbatim :exports both
pip install adastop
#+end_src

This will automatically install the command-line interface as well as the Python library.



** Help for cli tool

The AdaStop algorithm is initialized with the first test, done through =adastop compare=, and the current state of AdaStop is then saved in a pickle file. The help for the =adastop= command line can be obtained with the following:
@@ -47,7 +62,7 @@ The input format of adastop is under the form of a csv file containing the score

Let us launch AdaStop on this first batch of data.

First, we clean up the corrent directory of any litter files that could have been spawned by a previous usage of =adastop= (if you never used =adastop= before, this command will not have any effect).
First, we clean up the current directory of any litter files that could have been spawned by a previous usage of =adastop= (if you never used =adastop= before, this command will not have any effect).

#+begin_src bash :session *shell* :results verbatim :exports both
adastop reset . # reset the state of the comparator (remove hidden pickle file)
2 changes: 1 addition & 1 deletion docs/user_guide.md
@@ -60,7 +60,7 @@ Then, once you did the comparison on the first file, you can use iteratively `ad

#### Choice of comparisons

In adastopn, one can choose which comparisons are done. The default is to do all the pairwise comparisons between two algorithms. In practice, it is sometimes sufficient to compare to only one of them, a benchmark, for this the `--compare-to-first` argument can be used. For a more fine-grained control on which comparison to do, the python API can take the comparisons as input.
In adastop, one can choose which comparisons are done. The default is to do all the pairwise comparisons between two algorithms. In practice, it is sometimes sufficient to compare to only one of them, a benchmark, for this the `--compare-to-first` argument can be used. For a more fine-grained control on which comparison to do, the python API can take the comparisons as input.
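
As a sketch of that fine-grained control, the Python API accepts an explicit array of index pairs, following the pattern used in `tests/test_plot.py` of this PR (the agent indices and the remaining parameters here are illustrative):

```python
import numpy as np
from adastop.compare_agents import MultipleAgentsComparator

# Compare agent 0 (the benchmark) only against agents 1 and 2.
comparisons = np.array([(0, 1), (0, 2)])
comparator = MultipleAgentsComparator(n=5, K=3, B=5000, alpha=0.05, beta=0.01,
                                      comparisons=comparisons, seed=42)
```

From the command line, `--compare-to-first` achieves the same benchmark-only pattern.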

**Remark**: it is not statistically sound to run adastop several times and interpret the results as though they came from a single test; repeated runs amount to multiple testing and would require additional calibration. Instead, it is better to do all the comparisons at the same time, running the adastop algorithm only once, so that adastop handles the multiplicity of hypotheses by itself.

37 changes: 32 additions & 5 deletions tests/test_cli.py
@@ -1,30 +1,57 @@
import pytest
from click.testing import CliRunner
from adastop.cli import adastop
import os

# we reuse a bit of pytest's own testing machinery, this should eventually come
import subprocess


def test_cli():
runner = CliRunner()
test_pdf_path = "test.pdf"

if os.path.exists(test_pdf_path):
os.remove(test_pdf_path)

result = runner.invoke(adastop, ['reset', 'examples'])
assert result.exit_code == 0
for j in range(1,6):

result = runner.invoke(adastop, ['compare', 'examples/walker'+str(j)+'.csv'])
result = runner.invoke(adastop, ['compare', "--seed", "1", 'examples/walker'+str(j)+'.csv'])
assert result.exit_code == 0

result = runner.invoke(adastop, ['compare', 'examples/walker3.csv'])
result = runner.invoke(adastop, ['compare',"--seed", "1", 'examples/walker3.csv'])
assert result.exit_code == 1
assert result.exception.args[0] == 'Error: you tried to use more group than necessary. Use adastop status to see current status for more info.'

result = runner.invoke(adastop, ['plot', 'examples', "test.pdf"])

result = runner.invoke(adastop, ['plot', 'examples', test_pdf_path])
assert result.exit_code == 0
result = runner.invoke(adastop, ['status', 'examples'])
assert result.exit_code == 0
assert os.path.exists(test_pdf_path)


result = runner.invoke(adastop, ['reset', 'examples'])
assert result.exit_code == 0

result = runner.invoke(adastop, ['compare', "--compare-to-first", 'examples/walker1.csv'])
result = runner.invoke(adastop, ['compare', "--compare-to-first","--seed", "1", 'examples/walker1.csv'])
assert result.exit_code == 0



def test_plot_no_comparator_save_file():
runner = CliRunner()
runner.invoke(adastop, ['reset', 'examples'])

result = runner.invoke(adastop, ['plot', 'examples', "test.pdf"])
assert result.exit_code == 1
assert result.exception.args[0] == 'Comparator save file not found.'

def test_status_no_comparator_save_file():
runner = CliRunner()
runner.invoke(adastop, ['reset', 'examples'])

result = runner.invoke(adastop, ['status', 'examples'])
assert result.exit_code == 1
assert result.exception.args[0] == 'Comparator save file not found.'
31 changes: 23 additions & 8 deletions tests/test_error_toy.py → tests/test_compare_agents.py
@@ -5,16 +5,27 @@
B = 5000
alpha = 0.05
n_runs = 10
seed = 42

def test_runtime():
def test_partial_compare():
rng = np.random.RandomState(seed)
idxs = []
comparator = MultipleAgentsComparator(n=3, K=3, B=B, alpha=alpha, seed=42, beta = 0.01)
evals = {"Agent "+str(k):np.random.normal(size=3) for k in range(3)}
evals = {"Agent "+str(k): rng.normal(size=3) for k in range(3)}
comparator.partial_compare(evals)


def test_partial_compare_not_enough_points():
comparator = MultipleAgentsComparator(n=3, K=3, B=5000, alpha=-1e-5, seed=42, beta = 0.01)
evals = {"Agent 1":np.array([0,0,0]),"Agent 2":np.array([0,0,0]),"Agent 3":np.array([0,0,0])}
comparator.partial_compare(evals)



@pytest.mark.parametrize("K,n", [(10,2),(5,3), (3, 5), (1, 15)])
def test_type1(K,n):
rng = np.random.RandomState(seed)

idxs = []
n_agents = 3
for M in range(n_runs):
@@ -23,16 +34,18 @@ def test_type1(K,n):
while not comparator.is_finished:
if len(evals) >0:
for k in range(n_agents):
evals["Agent "+str(k)] = np.hstack([evals["Agent "+str(k)] ,np.random.normal(size=n)])
evals["Agent "+str(k)] = np.hstack([evals["Agent "+str(k)] , rng.normal(size=n)])
else:
evals = {"Agent "+str(k): np.random.normal(size=n) for k in range(n_agents)}
evals = {"Agent "+str(k): rng.normal(size=n) for k in range(n_agents)}
comparator.partial_compare(evals)
idxs.append(not("equal" in comparator.decisions.values()))
print(comparator.get_results())
assert np.mean(idxs) < 2*alpha + 1/4/(np.sqrt(n_runs)), "type 1 error seems to be too large."

@pytest.mark.parametrize("K,n", [(5,3), (3, 5), (1, 15)])
def test_type1_large_beta(K,n):
rng = np.random.RandomState(seed)

idxs = []
n_agents = 3
for M in range(n_runs):
@@ -41,16 +54,18 @@ def test_type1_large_beta(K,n):
while not comparator.is_finished:
if len(evals) >0:
for k in range(n_agents):
evals["Agent "+str(k)] = np.hstack([evals["Agent "+str(k)] ,np.random.normal(size=n)])
evals["Agent "+str(k)] = np.hstack([evals["Agent "+str(k)] , rng.normal(size=n)])
else:
evals = {"Agent "+str(k): np.random.normal(size=n) for k in range(n_agents)}
evals = {"Agent "+str(k): rng.normal(size=n) for k in range(n_agents)}
comparator.partial_compare(evals)
idxs.append(not("equal" in comparator.decisions.values()))
print(comparator.get_results())
assert np.mean(idxs) < 2*alpha + 1/4/(np.sqrt(n_runs)), "type 1 error seems to be too large."

@pytest.mark.parametrize("K,n", [(3, 5), (1, 15)])
def test_type2(K,n):
rng = np.random.RandomState(seed)

idxs = []
n_agents = 2
for M in range(n_runs):
@@ -59,9 +74,9 @@ def test_type2(K,n):
while not comparator.is_finished:
if len(evals) >0:
for k in range(n_agents):
evals["Agent "+str(k)] = np.hstack([evals["Agent "+str(k)] ,np.random.normal(size=n)+2*k])
evals["Agent "+str(k)] = np.hstack([evals["Agent "+str(k)] , rng.normal(size=n)+2*k])
else:
evals = {"Agent "+str(k): np.random.normal(size=n)+2*k for k in range(n_agents)}
evals = {"Agent "+str(k): rng.normal(size=n)+2*k for k in range(n_agents)}
comparator.partial_compare(evals)
idxs.append(not("equal" in comparator.decisions.values()))
assert np.mean(idxs) > 0.3, "type 2 error seems to be too large."
36 changes: 36 additions & 0 deletions tests/test_plot.py
@@ -76,3 +76,39 @@ def test_plot_sota_noteq():
# plt.savefig('fig2.pdf')
fig, axes= plt.subplots(1,2)
comparator.plot_results_sota(axes=axes)



def test_plot_noteq2():
n_agents = 3
comparator = MultipleAgentsComparator(n=10, K=K, B=B, alpha=alpha, seed=42, beta = 0.01)
evals = {}
while not comparator.is_finished:
if len(evals) >0:
for k in range(n_agents):
evals["Agent "+str(k)] = np.hstack([evals["Agent "+str(k)] , np.abs(2*K-k)+np.random.normal(size=10)])
else:
evals = {"Agent "+str(k): np.random.normal(size=10)+np.abs(2*K-k) for k in range(n_agents)}
comparator.partial_compare(evals)
# plt.savefig('fig2.pdf')
fig, axes= plt.subplots(1,2)
comparator.plot_results(axes=axes)

def test_plot_sota_noteq2():
n_agents = 3
comparisons = np.array([(0,i) for i in [1,2]])
comparator = MultipleAgentsComparator(n=10, K=K, B=B, alpha=alpha,
comparisons=comparisons, seed=42, beta = 0.01)
evals = {}
while not comparator.is_finished:
if len(evals) >0:
for k in range(n_agents):
evals["Agent "+str(k)] = np.hstack([evals["Agent "+str(k)] ,np.random.normal(size=10)+np.abs(2*K-k)])
else:
evals = {"Agent "+str(k): np.random.normal(size=10)+np.abs(2*K-k) for k in range(n_agents)}
comparator.partial_compare(evals)
comparator.plot_results_sota()
# plt.savefig('fig2.pdf')
fig, axes= plt.subplots(1,2)
comparator.plot_results_sota(axes=axes)