Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
MarcCote committed Nov 10, 2022
1 parent 311f89c commit 547b1b7
Show file tree
Hide file tree
Showing 6 changed files with 198 additions and 189 deletions.
30 changes: 15 additions & 15 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -26,29 +26,29 @@ ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
ENV NVIDIA_VISIBLE_DEVICES=all
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility

EXPOSE 5001 8883 8888 9000
EXPOSE 25300-25600
#EXPOSE 5001 8883 8888 9000
#EXPOSE 25300-25600
USER root:root
WORKDIR /opt
RUN wget http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip
#RUN wget http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip
RUN apt-get update \
&& apt-get install -y --no-install-recommends default-jre
RUN apt-get install -y --no-install-recommends unzip
RUN unzip stanford-corenlp-full-2018-10-05.zip \
&& mv $(ls -d stanford-corenlp-full-*/) corenlp \
&& rm *.zip
EXPOSE 5002-5100
EXPOSE 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018
EXPOSE 50022 50023 50024 50025 50026 50027 50028 50029 50030 50031 50032 50034 50035 50036 50037 50038 50039
EXPOSE 50022 50032 50042 50052 50062 50072 50082 50092 50102 50112 50122 50132 50142 50152 50162 50172 50182
COPY . /tdqn-scienceworld
RUN pip install -r /tdqn-scienceworld/requirements.txt
#RUN apt-get install -y --no-install-recommends unzip
#RUN unzip stanford-corenlp-full-2018-10-05.zip \
# && mv $(ls -d stanford-corenlp-full-*/) corenlp \
# && rm *.zip
#EXPOSE 5002-5100
#EXPOSE 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018
#EXPOSE 50022 50023 50024 50025 50026 50027 50028 50029 50030 50031 50032 50034 50035 50036 50037 50038 50039
#EXPOSE 50022 50032 50042 50052 50062 50072 50082 50092 50102 50112 50122 50132 50142 50152 50162 50172 50182
COPY . /drrn-scienceworld
RUN pip install -r /drrn-scienceworld/requirements.txt

RUN pip3 install torch==1.10.1+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html

WORKDIR /

ENV PYTHONPATH=/tdqn-scienceworld/drrn
ENV PYTHONPATH=/drrn-scienceworld/drrn
ENV HOME=""

WORKDIR /tdqn-scienceworld/drrn
WORKDIR /drrn-scienceworld/drrn
18 changes: 10 additions & 8 deletions beaker/batchSubmission.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
- name: sciworld-may31-drrn-8x100k-taskTASKID-seedSEEDNUM
image:
beaker: peterj/sciworld-drrn2c
arguments: [python3, train-scienceworld.py, --num_envs=8, --max_steps=100000, --task_idx=TASKID, --simplification_str=easy, --priority_fraction=0.50, --memory_size=100000, --env_step_limit=100, --log_freq=100, --checkpoint_freq=100000, --eval_freq=2000, --seed=SEEDNUM, --maxHistoriesPerFile=1000, --historySavePrefix=/results1/drrn1/results-seedSEEDNUM]
arguments: [python3, train-scienceworld.py, --num_envs=8, --max_steps=100000, --task_idx=TASKID, --simplification_str=easy, --priority_fraction=0.50, --memory_size=100000, --env_step_limit=100, --log_freq=100, --checkpoint_freq=5000, --eval_freq=1000, --seed=SEEDNUM]
result:
path: /results1/drrn1/
resources:
Expand All @@ -20,10 +20,11 @@
cluster: ai2/raja_p100
priority: normal
"""
template_command = "python train-scienceworld.py --num_envs=8 --max_steps=100000 --task_idx=TASKID --simplification_str=easy --priority_fraction=0.50 --memory_size=100000 --env_step_limit=100 --log_freq=100 --checkpoint_freq=5000 --eval_freq=1000 --seed=SEEDNUM --output_dir logs/drrn-8x100k-taskTASKID-seedSEEDNUM"


def populateTemplate(taskId, seedNum):
outStr = templateStr
outStr = template_command
outStr = outStr.replace("SEEDNUM", str(seedNum))
outStr = outStr.replace("TASKID", str(taskId))

Expand All @@ -47,17 +48,18 @@ def submitJob(filenameToRun):

numJobs = 0
for seed in range(0, 1):
for taskIdx in range(0, 30):
for taskIdx in range(0, 30):
tempFilename = "submit.yml"

print("Creating job (" + str(numJobs) + "): Task: " + str(taskIdx) + " seed: " + str(seed))
#print("Creating job (" + str(numJobs) + "): Task: " + str(taskIdx) + " seed: " + str(seed))
scriptStr = populateTemplate(taskIdx, seed)
writeTemplate(tempFilename, scriptStr)
submitJob(tempFilename)
#writeTemplate(tempFilename, scriptStr)
print(scriptStr)
#submitJob(tempFilename)

time.sleep(1)
#time.sleep(1)
numJobs += 1
print("")
#print("")
#print(populateTemplate(10, 2))

print("Submitted " + str(numJobs) + " jobs.")
21 changes: 12 additions & 9 deletions drrn/drrn.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,14 +73,17 @@ def build_state(self, obs, infos):
""" Returns a state representation built from various info sources. """
obs_ids = [self.sp.EncodeAsIds(o) for o in obs]
# TextWorld
look_ids = [self.sp.EncodeAsIds(info['look']) for info in infos]
inv_ids = [self.sp.EncodeAsIds(info['inv']) for info in infos]
#look_ids = [self.sp.EncodeAsIds(info['look']) for info in infos]
#inv_ids = [self.sp.EncodeAsIds(info['inv']) for info in infos]
look_ids = [self.sp.EncodeAsIds(look) for look in infos['look']]
inv_ids = [self.sp.EncodeAsIds(inv) for inv in infos['inv']]

# ScienceWorld

#print("obs:")
#print(obs)
#print("infos:")
#print(infos)
#print(infos)
#look_ids = [self.sp.EncodeAsIds(info['look']) for info in infos]
#inv_ids = [self.sp.EncodeAsIds(info['inv']) for info in infos]

Expand Down Expand Up @@ -146,11 +149,11 @@ def save(self, suffixStr=""):
print("Saving agent to path: " + str(self.save_path))
print("Started saving at: " + str(startTime))
sys.stdout.flush()

# First, remove any old backups
print("Removing old backups")
sys.stdout.flush()
try:
try:
files = os.listdir(self.save_path + "/bak")
for filename in files:
if (filename.startswith("memory")) or (filename.startswith("model") or (filename.startswith("progress") or (filename.startswith("log")))):
Expand All @@ -167,9 +170,9 @@ def save(self, suffixStr=""):
os.makedirs(self.save_path + "/bak", exist_ok=True)
files = os.listdir(self.save_path)
for filename in files:
if filename.startswith("memory") or filename.startswith("model"):
if filename.startswith("memory") or filename.startswith("model"):
shutil.move(self.save_path + "/" + filename, self.save_path + "/bak/" + filename)
if filename.startswith("progress") or filename.startswith("log"):
if filename.startswith("progress") or filename.startswith("log"):
shutil.copy(self.save_path + "/" + filename, self.save_path + "/bak/" + filename)


Expand All @@ -181,7 +184,7 @@ def save(self, suffixStr=""):

self.lastSaveSuccessful = False
with timeout(120):
print("Pickle")
print("Pickle")
print("Length: " + str(len(self.memory)) )
sys.stdout.flush()
pickle.dump(self.memory, open(pjoin(self.save_path, "memory" + str(suffixStr) + ".pkl"), 'wb'))
Expand All @@ -195,7 +198,7 @@ def save(self, suffixStr=""):
if (self.lastSaveSuccessful == False):
print("* Model failed to save (timeout).")
self.numSaveErrors += 1

print("Total number of save timeouts since running: " + str(self.numSaveErrors))

sys.stdout.flush()
Expand Down
Loading

0 comments on commit 547b1b7

Please sign in to comment.