diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 1f5d5a27..1311a1a0 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -18,8 +18,8 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  prep-smp-ubuntu-2204-docker:
-    name: gcc / ${{ matrix.build-type }} / ${{ matrix.sanitizer-type }}
+  prep-mpi-ubuntu-2204-docker:
+    name: gcc / ${{ matrix.build-type }}
     runs-on: self-hosted
     permissions:
       contents: read
@@ -35,13 +35,7 @@ jobs:
         shell: bash -l {0}
     strategy:
       matrix:
-        build-type: ['Release', 'RelWithDebInfo']
-        sanitizer-type: ['nosan', 'san']
-        exclude:
-          - build-type: 'RelWithDebInfo'
-            sanitizer-type: 'nosan'
-          - build-type: 'Release'
-            sanitizer-type: 'san'
+        build-type: ['Release', 'Sanitize']
 
     steps:
 
@@ -60,13 +54,21 @@ jobs:
         echo "SRC_DIR=$(pwd)" >> $GITHUB_ENV
         echo "PANDO_TEST_DISCOVERY_TIMEOUT=600" >> $GITHUB_ENV
         echo "IMAGE_VERSION=$(git log --pretty="%h" -1 Dockerfile.dev)" >> $GITHUB_ENV
-        if [ ${{ matrix.sanitizer-type }} == 'san' ]; then
+        if [ ${{ matrix.build-type }} == 'Sanitize' ]; then
           echo "PANDO_BUILD_DOCS=OFF" >> $GITHUB_ENV
-          echo "PANDO_CONTAINER_ENV=-e=PANDO_PREP_L1SP_HART=32768 -ePANDO_PREP_MAIN_NODE=8589934592 -e=PANDO_EXTRA_CXX_FLAGS='\"-fsanitize=address -fsanitize=undefined\"'" >> $GITHUB_ENV
+          echo "PANDO_CONTAINER_ENV=-e=PANDO_PREP_L1SP_HART=32768 -ePANDO_PREP_MAIN_NODE=8589934592" >> $GITHUB_ENV
         fi
-        if [ ${{ matrix.sanitizer-type }} == 'nosan' ]; then
+        if [ ${{ matrix.build-type }} == 'Release' ]; then
+          echo "PANDO_BUILD_DOCS=OFF" >> $GITHUB_ENV
           echo "PANDO_CONTAINER_ENV=-e=PANDORT_TESTS=ON" >> $GITHUB_ENV
         fi
+        # Give each known runner its own disjoint CPU set (even vs. odd ids); others stay unpinned.
+        echo "${{ runner.name }}"
+        if [ "${{ runner.name }}" == 'zerberus-0' ]; then
+          echo "CONTAINER_CPUSET='--cpuset-cpus=0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30'" >> $GITHUB_ENV
+        elif [ "${{ runner.name }}" == 'zerberus-1' ]; then
+          echo "CONTAINER_CPUSET='--cpuset-cpus=1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31'" >> $GITHUB_ENV
+        fi
         cat $GITHUB_ENV
 
     - name: Configure
diff --git a/.github/workflows/drivex.yml b/.github/workflows/drivex.yml
index a1dc2a8a..c20570de 100644
--- a/.github/workflows/drivex.yml
+++ b/.github/workflows/drivex.yml
@@ -19,7 +19,7 @@ concurrency:
 
 jobs:
   docker-drivex-ubuntu-2204:
-    name: gcc / ${{ matrix.build-type }} / ${{ matrix.sanitizer-type }}
+    name: gcc / ${{ matrix.build-type }}
     runs-on: self-hosted
     permissions:
       contents: read
@@ -36,7 +36,6 @@ jobs:
     strategy:
       matrix:
         build-type: ['Release']
-        sanitizer-type: ['nosan']
 
     steps:
 
@@ -56,6 +55,13 @@ jobs:
         echo "PANDO_TEST_DISCOVERY_TIMEOUT=600" >> $GITHUB_ENV
         echo "IMAGE_VERSION=$(git log --pretty="%h" -1 Dockerfile.dev)" >> $GITHUB_ENV
         echo "PANDO_CONTAINER_ENV=-e=PANDORT_TESTS=ON" >> $GITHUB_ENV
+        # Give each known runner its own disjoint CPU set (even vs. odd ids); others stay unpinned.
+        echo "${{ runner.name }}"
+        if [ "${{ runner.name }}" == 'zerberus-0' ]; then
+          echo "CONTAINER_CPUSET='--cpuset-cpus=0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30'" >> $GITHUB_ENV
+        elif [ "${{ runner.name }}" == 'zerberus-1' ]; then
+          echo "CONTAINER_CPUSET='--cpuset-cpus=1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31'" >> $GITHUB_ENV
+        fi
         cat $GITHUB_ENV
 
     - name: Configure
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a40d51c9..c5a80d40 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,6 +20,7 @@ endif()
 option(ENABLE_PANDORT_TESTS "Enable pando-rt to run tests" OFF)
 
 if (NOT ENABLE_PANDORT_TESTS)
+  message(STATUS "Not Enabling Pandort testing")
   set(BUILD_TESTING_SAVED "${BUILD_TESTING}")
   set(BUILD_TESTING OFF)
 endif()
@@ -43,13 +44,33 @@ if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}")
     )
 endif()
 
+# Create the custom "Sanitize" build type: -O2 with debug info, ASan and UBSan, NDEBUG defined.
+set(CMAKE_CXX_FLAGS_SANITIZE
+  "-O2 -g -fsanitize=address -fsanitize=undefined -DNDEBUG" CACHE STRING "Flags used by the C++ compiler during sanitizer builds"
+  FORCE )
+set(CMAKE_C_FLAGS_SANITIZE
+  "-O2 -g -fsanitize=address -fsanitize=undefined -DNDEBUG" CACHE STRING "Flags used by the C compiler during sanitizer builds"
+  FORCE )
+set(CMAKE_EXE_LINKER_FLAGS_SANITIZE
+  "" CACHE STRING "Flags used for linking binaries during sanitizer builds"
+  FORCE )
+set(CMAKE_SHARED_LINKER_FLAGS_SANITIZE
+  "" CACHE STRING "Flags used for linking shared libraries during sanitizer builds"
+  FORCE )
+
+MARK_AS_ADVANCED(
+  CMAKE_CXX_FLAGS_SANITIZE
+  CMAKE_C_FLAGS_SANITIZE
+  CMAKE_EXE_LINKER_FLAGS_SANITIZE
+  CMAKE_SHARED_LINKER_FLAGS_SANITIZE)
+
 # default build type
+set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo" "Sanitize" "Coverage")
 set(DEFAULT_BUILD_TYPE "Release")
 if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
   message(STATUS "Setting build type to default '${DEFAULT_BUILD_TYPE}' as none was specified.")
   set(CMAKE_BUILD_TYPE "${DEFAULT_BUILD_TYPE}" CACHE STRING "Choose the type of build." FORCE)
   # possible values of build type for cmake-gui
-  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
 endif ()
 
 # target
diff --git a/Makefile b/Makefile
index 45997b12..f2c061c3 100644
--- a/Makefile
+++ b/Makefile
@@ -18,6 +18,7 @@ CONTAINER_BUILD_DIR ?= /pando/dockerbuild
 CONTAINER_WORKDIR ?= ${CONTAINER_SRC_DIR}
 CONTAINER_CONTEXT ?= default
 CONTAINER_OPTS ?=
+CONTAINER_CPUSET ?=
 CONTAINER_CMD ?= setarch `uname -m` -R bash -l
 INTERACTIVE ?= i
 
@@ -110,6 +111,7 @@ docker:
 	-v ${SRC_DIR}/:${CONTAINER_SRC_DIR} \
 	${PANDO_CONTAINER_MOUNTS} \
 	${PANDO_CONTAINER_ENV} \
+	${CONTAINER_CPUSET} \
 	--privileged \
 	--workdir=${CONTAINER_WORKDIR} ${CONTAINER_OPTS} -${INTERACTIVE}t \
 	${IMAGE_NAME}:${VERSION} \
@@ -181,19 +183,19 @@ drive-deps:
 run-tests-mpi:
 	set -o pipefail && \
 	. ~/.profile && \
-	cd ${CONTAINER_BUILD_DIR} && ctest -j4 --verbose | tee test.out && \
+	cd ${CONTAINER_BUILD_DIR} && ctest -j2 --verbose | tee test.out && \
 	! grep -E "Failure" test.out && ! grep -E "runtime error" test.out
 
 run-tests-smp:
 	set -o pipefail && \
 	. ~/.profile && \
-	cd ${CONTAINER_BUILD_DIR}-smp && ctest -j4 --verbose | tee test.out && \
+	cd ${CONTAINER_BUILD_DIR}-smp && ctest -j2 --verbose | tee test.out && \
 	! grep -E "Failure" test.out && ! grep -E "runtime error" test.out
 
 run-tests-drv:
 	set -o pipefail && \
 	. ~/.profile && \
-	cd ${DRV_BUILD_DIR} && ctest -j4 --verbose | tee test.out && \
+	cd ${DRV_BUILD_DIR} && ctest -j2 --verbose | tee test.out && \
 	! grep -E "Failure" test.out && ! grep -E "runtime error" test.out
 
 run-tests: run-tests-mpi
diff --git a/cmake/PANDOTesting.cmake b/cmake/PANDOTesting.cmake
index 48ca8497..0f8b78c0 100644
--- a/cmake/PANDOTesting.cmake
+++ b/cmake/PANDOTesting.cmake
@@ -148,3 +148,36 @@ function(pando_add_bin_test TARGET ARGS INPUTFILE OKFILE)
 
   endif()
 endfunction()
+
+function(pando_add_bin_python_test TARGET ARGS INPUTFILE)
+  # Registers the CTest case ${TARGET}-${INPUTFILE}-pythonvalidate: TARGET is launched on
+  # INPUTFILE through the backend's driver script and its stdout is piped into
+  # scripts/mirror_master_validate.py.
+  set(NUM_PXNS 2)
+  set(NUM_CORES 4)
+
+  if (NOT PANDO_RT_BACKEND STREQUAL "DRVX")
+    # Bare variable name: an unset GASNet_CONDUIT reaches the FATAL_ERROR below instead of
+    # expanding to a malformed if() expression.
+    if (GASNet_CONDUIT STREQUAL "smp")
+      set(DRIVER_SCRIPT ${PROJECT_SOURCE_DIR}/pando-rt/scripts/preprun.sh)
+    elseif (GASNet_CONDUIT STREQUAL "mpi")
+      set(DRIVER_SCRIPT ${PROJECT_SOURCE_DIR}/pando-rt/scripts/preprun_mpi.sh)
+    else ()
+      message(FATAL_ERROR "No runner script for GASNet conduit ${GASNet_CONDUIT}")
+    endif ()
+
+    add_test(NAME ${TARGET}-${INPUTFILE}-pythonvalidate
+      COMMAND bash -c "${DRIVER_SCRIPT} -n ${NUM_PXNS} -c ${NUM_CORES} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET} ${ARGS} ${INPUTFILE} | python3 ${PROJECT_SOURCE_DIR}/scripts/mirror_master_validate.py")
+  else()
+    set(DRIVER_SCRIPT ${PROJECT_SOURCE_DIR}/scripts/run-drv.sh)
+    set(NUM_HTHREADS 8)
+
+    # DRVX runs are pointed at lib<FNAME>.so in the current binary dir, not at an executable.
+    get_filename_component(FNAME ${TARGET} NAME)
+
+    add_test(NAME ${TARGET}-${INPUTFILE}-pythonvalidate
+      COMMAND bash -c "LAUNCH_DIR=${CMAKE_SOURCE_DIR} ${DRIVER_SCRIPT} -p ${NUM_HTHREADS} -n ${NUM_PXNS} -c ${NUM_CORES} \
+      ${CMAKE_CURRENT_BINARY_DIR}/lib${FNAME}.so ${ARGS} ${INPUTFILE} | python3 ${PROJECT_SOURCE_DIR}/scripts/mirror_master_validate.py")
+  endif()
+endfunction()
diff --git a/include/pando-lib-galois/containers/host_cached_array.hpp b/include/pando-lib-galois/containers/host_cached_array.hpp
new file mode 100644
index 00000000..894cf66a
--- /dev/null
+++ b/include/pando-lib-galois/containers/host_cached_array.hpp
@@ -0,0 +1,341 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023. University of Texas at Austin. All rights reserved.
+
+#ifndef PANDO_LIB_GALOIS_CONTAINERS_HOST_CACHED_ARRAY_HPP_
+#define PANDO_LIB_GALOIS_CONTAINERS_HOST_CACHED_ARRAY_HPP_
+
+#include <algorithm>
+#include <cstdint>
+#include <iterator>
+#include <utility>
+
+#include "pando-rt/export.h"
+#include <pando-lib-galois/containers/host_indexed_map.hpp>
+#include <pando-lib-galois/containers/host_local_storage.hpp>
+#include <pando-lib-galois/loops/do_all.hpp>
+#include <pando-lib-galois/utility/gptr_monad.hpp>
+#include <pando-rt/containers/array.hpp>
+#include <pando-rt/memory/allocate_memory.hpp>
+#include <pando-rt/memory/global_ptr.hpp>
+
+namespace galois {
+
+template <typename T>
+class HostCachedArrayIterator;
+
+/**
+ * @brief This is an array like container that has an array on each host */
+template <typename T>
+class HostCachedArray {
+public:
+  HostCachedArray() noexcept = default;
+
+  HostCachedArray(const HostCachedArray&) = default;
+  HostCachedArray(HostCachedArray&&) = default;
+
+  ~HostCachedArray() = default;
+
+  HostCachedArray& operator=(const HostCachedArray&) = default;
+  HostCachedArray& operator=(HostCachedArray&&) = default;
+
+  using iterator = HostCachedArrayIterator<T>;
+  using reverse_iterator = std::reverse_iterator<iterator>;
+
+  /**
+   * @brief Initializes one array per host and fills each host's map with every host's array
+   *
+   * On each host the local array is initialized with the range entry at that host's index, and
+   * size() becomes the sum of the resulting per-host array sizes.
+   *
+   * @tparam Range range type providing `size()` and a `begin()` that supports `+ n` and `*`
+   * @param[in] range one entry per host (asserted); presumably the per-host element counts
+   */
+  template <typename Range>
+  [[nodiscard]] pando::Status initialize(Range range) {
+    assert(range.size() == m_data.size());
+    size_ = 0;
+    PANDO_CHECK_RETURN(m_data.initialize());
+    PANDO_CHECK_RETURN(galois::doAll(
+        range, m_data,
+        +[](Range range, pando::GlobalRef<galois::HostIndexedMap<pando::Array<T>>> data) {
+          PANDO_CHECK(lift(data, initialize));
+          auto ref = lift(data, getLocalRef);
+          PANDO_CHECK(fmap(
+              ref, initialize,
+              *(range.begin() + static_cast<std::uint64_t>(pando::getCurrentPlace().node.id))));
+        }));
+    PANDO_CHECK_RETURN(galois::doAll(
+        m_data, m_data,
+        +[](decltype(m_data) complete, galois::HostIndexedMap<pando::Array<T>> data) {
+          for (std::uint64_t i = 0; i < data.size(); i++) {
+            data[i] = fmap(complete[i], operator[], i);
+          }
+        }));
+    for (std::uint64_t i = 0; i < m_data.size(); i++) {
+      auto ref = fmap(m_data[i], operator[], i);
+      size_ += lift(ref, size);
+    }
+    return pando::Status::Success;
+  }
+
+  void deinitialize() {
+    PANDO_CHECK(galois::doAll(
+        m_data, +[](galois::HostIndexedMap<pando::Array<T>> data) {
+          const std::uint64_t i = static_cast<std::uint64_t>(pando::getCurrentPlace().node.id);
+          auto ref = data[i];
+          liftVoid(ref, deinitialize);
+          liftVoid(data, deinitialize);
+        }));
+    m_data.deinitialize();
+  }
+
+  /**
+   * @brief Returns a pointer to element localIdx of the array stored under index host
+   * @warning this method performs no bounds checks of its own on host or localIdx
+   */
+  pando::GlobalPtr<T> getSpecific(std::uint64_t host, std::uint64_t localIdx) noexcept {
+    HostIndexedMap<pando::Array<T>> cache = m_data.getLocalRef();
+    return &fmap(cache[host], get, localIdx);
+  }
+
+  /**
+   * @brief Returns a reference to element localIdx of the array stored under index host
+   * @warning this method performs no bounds checks of its own on host or localIdx
+   */
+  pando::GlobalRef<T> getSpecificRef(std::uint64_t host, std::uint64_t localIdx) noexcept {
+    return *this->getSpecific(host, localIdx);
+  }
+
+  /**
+   * @brief Returns a pointer to flattened index i, or nullptr when i is past the last element
+   */
+  pando::GlobalPtr<const T> get(std::uint64_t i) const noexcept {
+    HostIndexedMap<pando::Array<T>> cache = m_data.getLocalRef();
+    auto curr = cache.begin();
+    for (; curr != cache.end(); curr++) {
+      auto size = lift(*curr, size);
+      if (i < size) {
+        break;
+      }
+      i -= size;
+    }
+    if (curr == cache.end())
+      return nullptr;
+    return &fmap(*curr, get, i);
+  }
+
+  /**
+   * @brief Returns a pointer to flattened index i, or nullptr when i is past the last element
+   */
+  pando::GlobalPtr<T> get(std::uint64_t i) noexcept {
+    HostIndexedMap<pando::Array<T>> cache = m_data.getLocalRef();
+    auto curr = cache.begin();
+    for (; curr != cache.end(); curr++) {
+      auto size = lift(*curr, size);
+      if (i < size) {
+        break;
+      }
+      i -= size;
+    }
+    if (curr == cache.end())
+      return nullptr;
+    // i has been reduced to the offset inside the array that *curr refers to.
+    return &fmap(*curr, get, i);
+  }
+
+  constexpr pando::GlobalRef<T> operator[](std::uint64_t pos) noexcept {
+    return *this->get(pos);
+  }
+
+  constexpr pando::GlobalRef<const T> operator[](std::uint64_t pos) const noexcept {
+    return *this->get(pos);
+  }
+
+  constexpr bool empty() const noexcept {
+    return this->size() == 0;
+  }
+
+  constexpr std::uint64_t size() noexcept {
+    return size_;
+  }
+
+  constexpr std::uint64_t size() const noexcept {
+    return size_;
+  }
+
+  constexpr std::uint64_t capacity() noexcept {
+    return size();
+  }
+
+  iterator begin() noexcept {
+    return iterator(*this, 0);
+  }
+
+  iterator begin() const noexcept {
+    return iterator(*this, 0);
+  }
+
+  iterator end() noexcept {
+    return iterator(*this, size_);
+  }
+
+  iterator end() const noexcept {
+    return iterator(*this, size_);
+  }
+
+  /**
+   * @brief reverse iterator to the last element, i.e. the start of the reversed sequence
+   */
+  reverse_iterator rbegin() noexcept {
+    return reverse_iterator(end());
+  }
+
+  /**
+   * @copydoc rbegin()
+   */
+  reverse_iterator rbegin() const noexcept {
+    return reverse_iterator(end());
+  }
+
+  /**
+   * @brief reverse iterator past the first element, i.e. the end of the reversed sequence
+   */
+  reverse_iterator rend() noexcept {
+    return reverse_iterator(begin());
+  }
+
+  /**
+   * @copydoc rend()
+   */
+  reverse_iterator rend() const noexcept {
+    return reverse_iterator(begin());
+  }
+
+  friend bool operator==(const HostCachedArray& a, const HostCachedArray& b) {
+    return a.size() == b.size() && a.m_data.getLocal() == b.m_data.getLocal();
+  }
+
+private:
+  /// @brief The data structure storing the data this stores a cache once constructed
+  galois::HostLocalStorage<galois::HostIndexedMap<pando::Array<T>>> m_data;
+  /// @brief Stores the amount of data in the array, may be less than allocated
+  uint64_t size_ = 0;
+};
+
+/**
+ * @brief an iterator that stores the HostCachedArray and the current position to provide random
+ * access iterator semantics
+ */
+template <typename T>
+class HostCachedArrayIterator {
+  HostCachedArray<T> m_arr;
+  std::uint64_t m_pos;
+
+public:
+  using iterator_category = std::random_access_iterator_tag;
+  using difference_type = std::int64_t;
+  using value_type = T;
+  using pointer = pando::GlobalPtr<T>;
+  using reference = pando::GlobalRef<T>;
+
+  HostCachedArrayIterator(HostCachedArray<T> arr, std::uint64_t pos) : m_arr(arr), m_pos(pos) {}
+
+  constexpr HostCachedArrayIterator() noexcept = default;
+  constexpr HostCachedArrayIterator(HostCachedArrayIterator&&) noexcept = default;
+  constexpr HostCachedArrayIterator(const HostCachedArrayIterator&) noexcept = default;
+  ~HostCachedArrayIterator() = default;
+
+  constexpr HostCachedArrayIterator& operator=(const HostCachedArrayIterator&) noexcept = default;
+  constexpr HostCachedArrayIterator& operator=(HostCachedArrayIterator&&) noexcept = default;
+
+  reference operator*() const noexcept {
+    return m_arr[m_pos];
+  }
+
+  reference operator*() noexcept {
+    return m_arr[m_pos];
+  }
+
+  pointer operator->() {
+    return m_arr.get(m_pos);
+  }
+
+  HostCachedArrayIterator& operator++() {
+    m_pos++;
+    return *this;
+  }
+
+  HostCachedArrayIterator operator++(int) {
+    HostCachedArrayIterator tmp = *this;
+    ++(*this);
+    return tmp;
+  }
+
+  HostCachedArrayIterator& operator--() {
+    m_pos--;
+    return *this;
+  }
+
+  HostCachedArrayIterator operator--(int) {
+    HostCachedArrayIterator tmp = *this;
+    --(*this);
+    return tmp;
+  }
+
+  constexpr HostCachedArrayIterator operator+(std::uint64_t n) const noexcept {
+    return HostCachedArrayIterator(m_arr, m_pos + n);
+  }
+
+  constexpr HostCachedArrayIterator& operator+=(std::uint64_t n) noexcept {
+    m_pos += n;
+    return *this;
+  }
+
+  constexpr HostCachedArrayIterator operator-(std::uint64_t n) const noexcept {
+    return HostCachedArrayIterator(m_arr, m_pos - n);
+  }
+
+  constexpr difference_type operator-(HostCachedArrayIterator b) const noexcept {
+    return m_pos - b.m_pos;
+  }
+
+  reference operator[](std::uint64_t n) noexcept {
+    return m_arr[m_pos + n];
+  }
+
+  reference operator[](std::uint64_t n) const noexcept {
+    return m_arr[m_pos + n];
+  }
+
+  friend bool operator==(const HostCachedArrayIterator& a, const HostCachedArrayIterator& b) {
+    return a.m_pos == b.m_pos && a.m_arr == b.m_arr;
+  }
+
+  friend bool operator!=(const HostCachedArrayIterator& a, const HostCachedArrayIterator& b) {
+    return !(a == b);
+  }
+
+  friend bool operator<(const HostCachedArrayIterator& a, const HostCachedArrayIterator& b) {
+    return a.m_pos < b.m_pos;
+  }
+
+  friend bool operator<=(const HostCachedArrayIterator& a, const HostCachedArrayIterator& b) {
+    return a.m_pos <= b.m_pos;
+  }
+
+  friend bool operator>(const HostCachedArrayIterator& a, const HostCachedArrayIterator& b) {
+    return a.m_pos > b.m_pos;
+  }
+
+  friend bool operator>=(const HostCachedArrayIterator& a, const HostCachedArrayIterator& b) {
+    return a.m_pos >= b.m_pos;
+  }
+
+  friend pando::Place localityOf(HostCachedArrayIterator& a) {
+    pando::GlobalPtr<T> ptr = &a.m_arr[a.m_pos];
+    return pando::localityOf(ptr);
+  }
+};
+} // namespace galois
+
+#endif // PANDO_LIB_GALOIS_CONTAINERS_HOST_CACHED_ARRAY_HPP_
diff --git a/include/pando-lib-galois/containers/host_local_storage.hpp b/include/pando-lib-galois/containers/host_local_storage.hpp
index 41f20ecc..629c724f 100644
--- a/include/pando-lib-galois/containers/host_local_storage.hpp
+++ b/include/pando-lib-galois/containers/host_local_storage.hpp
@@ -18,7 +18,7 @@ namespace galois {
 
 namespace HostLocalStorageHeap {
 
-constexpr std::uint64_t Size = 1 << 10;
+constexpr std::uint64_t Size = 1 << 20;
 constexpr std::uint64_t Granule = 128;
 struct ModestArray {
   std::byte arr[Size];
diff --git a/include/pando-lib-galois/containers/per_thread.hpp b/include/pando-lib-galois/containers/per_thread.hpp
index 0061a7e4..804dc510 100644
--- a/include/pando-lib-galois/containers/per_thread.hpp
+++ b/include/pando-lib-galois/containers/per_thread.hpp
@@ -389,7 +389,7 @@ class PerThreadVector {
 
     galois::PrefixSum<SRC, DST, SRC_Val, DST_Val, transmute, scan_op, combiner, galois::DistArray>
         prefixSum(m_data, m_indices);
-    PANDO_CHECK_RETURN(prefixSum.initialize());
+    PANDO_CHECK_RETURN(prefixSum.initialize(pando::getPlaceDims().node.id));
 
     prefixSum.computePrefixSum(m_indices.size());
     indices_computed = true;
diff --git a/include/pando-lib-galois/containers/thread_local_storage.hpp b/include/pando-lib-galois/containers/thread_local_storage.hpp
index bf0dd1a3..f8c8957d 100644
--- a/include/pando-lib-galois/containers/thread_local_storage.hpp
+++ b/include/pando-lib-galois/containers/thread_local_storage.hpp
@@ -37,13 +37,29 @@ class ThreadLocalStorage {
   using iterator = ThreadLocalStorageIt<T>;
   using reverse_iterator = std::reverse_iterator<iterator>;
 
+  [[nodiscard]] static constexpr std::uint64_t getThreadsPerCore() noexcept {
+    std::uint64_t threads = static_cast<std::uint64_t>(pando::getThreadDims().id);
+    return threads;
+  }
+
+  [[nodiscard]] static constexpr std::uint64_t getThreadsPerPod() noexcept {
+    const auto place = pando::getPlaceDims();
+    const std::uint64_t cores =
+        static_cast<std::uint64_t>(place.core.x) * static_cast<std::uint64_t>(place.core.y);
+    return cores * getThreadsPerCore();
+  }
+
+  [[nodiscard]] static constexpr std::uint64_t getThreadsPerHost() noexcept {
+    const auto place = pando::getPlaceDims();
+    const std::uint64_t pods =
+        static_cast<std::uint64_t>(place.pod.x) * static_cast<std::uint64_t>(place.pod.y);
+    return pods * getThreadsPerPod();
+  }
+
   [[nodiscard]] static constexpr std::uint64_t getNumThreads() noexcept {
     const auto place = pando::getPlaceDims();
     std::uint64_t nodes = static_cast<std::uint64_t>(place.node.id);
-    std::uint64_t pods = static_cast<std::uint64_t>(place.pod.x * place.pod.y);
-    std::uint64_t cores = static_cast<std::uint64_t>(place.core.x * place.core.y);
-    std::uint64_t threads = static_cast<std::uint64_t>(pando::getThreadDims().id);
-    return nodes * pods * cores * threads;
+    return nodes * getThreadsPerHost();
   }
 
   [[nodiscard]] constexpr std::uint64_t getCurrentThreadIdx() const noexcept {
diff --git a/include/pando-lib-galois/containers/thread_local_vector.hpp b/include/pando-lib-galois/containers/thread_local_vector.hpp
new file mode 100644
index 00000000..0893d14c
--- /dev/null
+++ b/include/pando-lib-galois/containers/thread_local_vector.hpp
@@ -0,0 +1,407 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023. University of Texas at Austin. All rights reserved.
+#ifndef PANDO_LIB_GALOIS_CONTAINERS_THREAD_LOCAL_VECTOR_HPP_
+#define PANDO_LIB_GALOIS_CONTAINERS_THREAD_LOCAL_VECTOR_HPP_
+#include <cstdint>
+#include <iterator>
+#include <utility>
+
+#include "pando-rt/export.h"
+#include <pando-lib-galois/containers/array.hpp>
+#include <pando-lib-galois/containers/host_cached_array.hpp>
+#include <pando-lib-galois/containers/thread_local_storage.hpp>
+#include <pando-lib-galois/utility/counted_iterator.hpp>
+#include <pando-lib-galois/utility/gptr_monad.hpp>
+#include <pando-lib-galois/utility/prefix_sum.hpp>
+#include <pando-rt/containers/vector.hpp>
+#include <pando-rt/memory/global_ptr.hpp>
+
+namespace galois {
+
+template <typename T>
+class ThreadLocalVector {
+public:
+  ThreadLocalVector() noexcept = default;
+  ThreadLocalVector(const ThreadLocalVector&) = default;
+  ThreadLocalVector(ThreadLocalVector&&) = default;
+
+  ~ThreadLocalVector() = default;
+
+  ThreadLocalVector& operator=(const ThreadLocalVector&) = default;
+  ThreadLocalVector& operator=(ThreadLocalVector&&) = default;
+
+  using iterator = ThreadLocalStorageIt<pando::Vector<T>>;
+  using reverse_iterator = std::reverse_iterator<iterator>;
+
+  [[nodiscard]] pando::Status initialize() {
+    pando::Vector<T> vec;
+    PANDO_CHECK_RETURN(vec.initialize(0));
+    this->m_data = PANDO_EXPECT_RETURN(galois::copyToAllThreads(vec));
+    return pando::Status::Success;
+  }
+
+  void deinitialize() {
+    if (indicesInitialized) {
+      m_indices.deinitialize();
+      indicesInitialized = indicesComputed = false;  // freed above; recompute after reuse
+    }
+    for (pando::Vector<T> vec : m_data) {
+      vec.deinitialize();
+    }
+    m_data.deinitialize();
+  }
+
+  pando::GlobalPtr<pando::Vector<T>> getLocal() {
+    return m_data.getLocal();
+  }
+
+  pando::GlobalPtr<const pando::Vector<T>> getLocal() const noexcept {
+    return m_data.getLocal();
+  }
+
+  pando::GlobalRef<pando::Vector<T>> getLocalRef() {
+    return *m_data.getLocal();
+  }
+
+  pando::GlobalRef<const pando::Vector<T>> getLocalRef() const noexcept {
+    return *m_data.getLocal();
+  }
+
+  pando::GlobalPtr<pando::Vector<T>> get(std::uint64_t i) noexcept {
+    return m_data.get(i);
+  }
+
+  pando::GlobalPtr<const pando::Vector<T>> get(std::uint64_t i) const noexcept {
+    return m_data.get(i);
+  }
+
+  constexpr pando::GlobalRef<pando::Vector<T>> operator[](std::uint64_t pos) noexcept {
+    return *get(pos);
+  }
+
+  constexpr pando::GlobalRef<const pando::Vector<T>> operator[](std::uint64_t pos) const noexcept {
+    return *get(pos);
+  }
+
+  /**
+   * @brief Appends to the current hardware thread's vector; computed indices are not invalidated
+   */
+  [[nodiscard]] pando::Status pushBack(T val) {
+    return fmap(this->getLocalRef(), pushBack, val);
+  }
+
+  /**
+   * @brief Returns the total number of elements in the ThreadLocalVector
+   */
+  std::uint64_t sizeAll() const {
+    if (indicesComputed) {
+      return *m_indices.rbegin();
+    }
+    std::uint64_t size = 0;
+    for (std::uint64_t i = 0; i < m_data.size(); i++) {
+      pando::Vector<T> vec = m_data[i];
+      size += vec.size();
+    }
+    return size;
+  }
+
+  /**
+   * @brief Returns the total number of per thread vectors
+   */
+  size_t size() const {
+    return m_data.size();
+  }
+
+  void clear() {
+    indicesComputed = false;
+    for (std::uint64_t i = 0; i < m_data.size(); i++) {
+      liftVoid(m_data[i], clear);
+    }
+  }
+
+private:
+  static uint64_t transmute(pando::Vector<T> p) {
+    return p.size();
+  }
+  static uint64_t scan_op(pando::Vector<T> p, uint64_t l) {
+    return p.size() + l;
+  }
+  static uint64_t combiner(uint64_t f, uint64_t s) {
+    return f + s;
+  }
+
+public:
+  [[nodiscard]] pando::Status computeIndices() {
+    if (!indicesInitialized) {
+      PANDO_CHECK_RETURN(m_indices.initialize());
+      indicesInitialized = true;
+    }
+
+    using SRC = galois::ThreadLocalStorage<pando::Vector<T>>;
+    using DST = galois::ThreadLocalStorage<uint64_t>;
+    using SRC_Val = pando::Vector<T>;
+    using DST_Val = uint64_t;
+
+    galois::PrefixSum<SRC, DST, SRC_Val, DST_Val, transmute, scan_op, combiner, galois::Array>
+        prefixSum(m_data, m_indices);
+    PANDO_CHECK_RETURN(prefixSum.initialize(pando::getPlaceDims().node.id));
+
+    prefixSum.computePrefixSum(m_indices.size());
+    indicesComputed = true;
+
+    prefixSum.deinitialize();
+    return pando::Status::Success;
+  }
+
+  /**
+   * @brief Returns the global index at which the elements of `host` start
+   *
+   * @param indices per-thread prefix sums; the entry before a thread's index is its start offset
+   * @param host host to query; callers in this class pass values up to the number of hosts
+   */
+  [[nodiscard]] static pando::Expected<std::uint64_t> hostIndexOffset(
+      galois::ThreadLocalStorage<std::uint64_t> indices, uint64_t host) noexcept {
+    if (host == 0)
+      return static_cast<std::uint64_t>(0);
+    const auto place =
+        pando::Place(pando::NodeIndex(host), pando::PodIndex(0, 0), pando::CoreIndex(0, 0));
+    const auto idx = indices.getThreadIdxFromPlace(place, pando::ThreadIndex(0));
+    return indices[idx - 1];
+  }
+
+  [[nodiscard]] pando::Status hostFlattenAppend(galois::HostLocalStorage<pando::Vector<T>> flat) {
+    pando::Status err;
+
+    if (!indicesComputed) {
+      PANDO_CHECK_RETURN(computeIndices());
+    }
+
+    // TODO(AdityaAtulTewari) Make this properly parallel.
+    // Initialize the per host vectors
+    for (std::uint64_t i = 0; i < flat.getNumHosts(); i++) {
+      auto ref = flat[i];
+      std::uint64_t start = PANDO_EXPECT_RETURN(hostIndexOffset(m_indices, i));
+      std::uint64_t end = PANDO_EXPECT_RETURN(hostIndexOffset(m_indices, i + 1));
+      err = fmap(ref, reserve, lift(ref, size) + end - start);
+      PANDO_CHECK_RETURN(err);
+      for (std::uint64_t j = 0; j < end - start; j++) {
+        PANDO_CHECK_RETURN(fmap(ref, pushBack, T()));
+      }
+    }
+
+    auto tpl = galois::make_tpl(static_cast<ThreadLocalVector>(*this), flat);
+    // Reduce into the per host vectors
+    auto f = +[](decltype(tpl) assign, std::uint64_t i, uint64_t) {
+      auto [data, flat] = assign;
+      std::uint64_t host = i / ThreadLocalStorage<T>::getThreadsPerHost();
+      std::uint64_t start = PANDO_EXPECT_CHECK(data.hostIndexOffset(data.m_indices, host));
+      std::uint64_t curr = (i == 0) ? 0 : data.m_indices[i - 1];
+      std::uint64_t end = PANDO_EXPECT_CHECK(data.hostIndexOffset(data.m_indices, host + 1));
+
+      auto ref = flat[host];
+      pando::Vector<T> localVec = data[i];
+      std::uint64_t size = lift(ref, size) - (end - start);
+      for (T elt : localVec) {
+        fmap(ref, get, size + curr - start) = elt;
+        curr++;
+      }
+    };
+    galois::onEach(tpl, f);
+    return pando::Status::Success;
+  }
+
+private:
+  class SizeIt {
+  public:
+    SizeIt() noexcept = default;
+    SizeIt(const SizeIt&) = default;
+    SizeIt(SizeIt&&) = default;
+    ~SizeIt() = default;
+    SizeIt& operator=(const SizeIt&) = default;
+    SizeIt& operator=(SizeIt&&) = default;
+    SizeIt(ThreadLocalStorage<std::uint64_t> indices, std::uint64_t host)
+        : m_indices(indices), m_host(host) {}
+    using output_type = std::int64_t;
+    using difference_type = std::int64_t;
+
+    output_type operator*() const noexcept {
+      const std::uint64_t start = PANDO_EXPECT_CHECK(hostIndexOffset(m_indices, m_host));
+      const std::uint64_t end = PANDO_EXPECT_CHECK(hostIndexOffset(m_indices, m_host + 1));
+      return end - start;
+    }
+
+    SizeIt& operator++() {
+      m_host++;
+      return *this;
+    }
+
+    SizeIt operator++(int) {
+      SizeIt tmp = *this;
+      ++(*this);
+      return tmp;
+    }
+
+    SizeIt& operator--() {
+      m_host--;
+      return *this;
+    }
+
+    SizeIt operator--(int) {
+      SizeIt tmp = *this;
+      --(*this);
+      return tmp;
+    }
+
+    constexpr SizeIt operator+(std::uint64_t n) const noexcept {
+      return SizeIt(m_indices, m_host + n);
+    }
+
+    constexpr SizeIt& operator+=(std::uint64_t n) noexcept {
+      m_host += n;
+      return *this;
+    }
+
+    constexpr SizeIt operator-(std::uint64_t n) const noexcept {
+      return SizeIt(m_indices, m_host - n);
+    }
+
+    constexpr difference_type operator-(SizeIt b) const noexcept {
+      return m_host - b.m_host;  // distance in hosts between the two iterators
+    }
+
+    friend bool operator==(const SizeIt& a, const SizeIt& b) {
+      return a.m_host == b.m_host && a.m_indices == b.m_indices;
+    }
+
+    friend bool operator!=(const SizeIt& a, const SizeIt& b) {
+      return !(a == b);
+    }
+
+    friend bool operator<(const SizeIt& a, const SizeIt& b) {
+      return a.m_host < b.m_host;
+    }
+
+    friend bool operator<=(const SizeIt& a, const SizeIt& b) {
+      return a.m_host <= b.m_host;
+    }
+
+    friend bool operator>(const SizeIt& a, const SizeIt& b) {
+      return a.m_host > b.m_host;
+    }
+
+    friend bool operator>=(const SizeIt& a, const SizeIt& b) {
+      return a.m_host >= b.m_host;
+    }
+
+    friend pando::Place localityOf(SizeIt& a) {
+      return pando::Place{pando::NodeIndex(a.m_host), pando::anyPod, pando::anyCore};
+    }
+
+  private:
+    galois::ThreadLocalStorage<std::uint64_t> m_indices;
+    std::uint64_t m_host;
+  };
+
+  struct SizeRange {
+    using iterator = SizeIt;
+    galois::ThreadLocalStorage<std::uint64_t> m_indices;
+    SizeRange() noexcept = default;
+    SizeRange(const SizeRange&) = default;
+    SizeRange(SizeRange&&) = default;
+    ~SizeRange() = default;
+    SizeRange& operator=(const SizeRange&) = default;
+    SizeRange& operator=(SizeRange&&) = default;
+    explicit SizeRange(ThreadLocalStorage<std::uint64_t> indices) : m_indices(indices) {}
+    iterator begin() const noexcept {
+      return iterator(m_indices, 0);
+    }
+
+    iterator end() const noexcept {
+      std::uint64_t numHosts = pando::getPlaceDims().node.id;
+      return iterator(m_indices, numHosts);
+    }
+    std::uint64_t size() const noexcept {
+      return pando::getPlaceDims().node.id;
+    }
+  };
+
+public:
+  [[nodiscard]] pando::Expected<galois::HostCachedArray<T>> hostCachedFlatten() {
+    // Flattens all per-thread vectors into a HostCachedArray holding one array per host.
+    if (!indicesComputed) {
+      PANDO_CHECK_RETURN(computeIndices());
+    }
+    galois::HostCachedArray<T> hla;
+    // TODO(AdityaAtulTewari) Make this properly parallel.
+    // Size each host's array by the number of elements held by that host's threads.
+    PANDO_CHECK_RETURN(hla.initialize(SizeRange(m_indices)));
+    auto tpl = galois::make_tpl(static_cast<ThreadLocalVector>(*this), hla);
+    // Copy each thread's elements into its host's array at the thread's offset within the host.
+    auto f = +[](decltype(tpl) assign, std::uint64_t i, uint64_t) {
+      auto [data, flat] = assign;
+      std::uint64_t host = i / ThreadLocalStorage<T>::getThreadsPerHost();
+      std::uint64_t start = PANDO_EXPECT_CHECK(hostIndexOffset(data.m_indices, host));
+      std::uint64_t curr = (i == 0) ? 0 : data.m_indices[i - 1];
+      pando::Vector<T> localVec = data[i];
+      for (T elt : localVec) {
+        flat.getSpecificRef(host, curr - start) = elt;
+        curr++;
+      }
+    };
+    galois::onEach(tpl, f);
+    return hla;
+  }
+
+  iterator begin() noexcept {
+    return iterator(this->m_data, 0);
+  }
+
+  iterator begin() const noexcept {
+    return iterator(this->m_data, 0);
+  }
+
+  iterator end() noexcept {
+    return iterator(this->m_data, size());
+  }
+
+  iterator end() const noexcept {
+    return iterator(this->m_data, size());
+  }
+
+  /**
+   * @brief reverse iterator to the first element
+   */
+  reverse_iterator rbegin() noexcept {
+    return reverse_iterator(end()--);
+  }
+
+  /**
+   * @copydoc rbegin()
+   */
+  reverse_iterator rbegin() const noexcept {
+    return reverse_iterator(end()--);
+  }
+
+  /**
+   * @brief reverse iterator to the last element
+   */
+  reverse_iterator rend() noexcept {
+    return reverse_iterator(begin()--);
+  }
+
+  /**
+   * @copydoc rend()
+   */
+  reverse_iterator rend() const noexcept {
+    return reverse_iterator(begin()--);
+  }
+
+private:
+  galois::ThreadLocalStorage<pando::Vector<T>> m_data;
+  galois::ThreadLocalStorage<std::uint64_t> m_indices;
+  bool indicesInitialized = false;
+  bool indicesComputed = false;
+};
+
+} // namespace galois
+
+#endif // PANDO_LIB_GALOIS_CONTAINERS_THREAD_LOCAL_VECTOR_HPP_
diff --git a/include/pando-lib-galois/graphs/dist_local_csr.hpp b/include/pando-lib-galois/graphs/dist_local_csr.hpp
index d1528df7..9802ecb1 100644
--- a/include/pando-lib-galois/graphs/dist_local_csr.hpp
+++ b/include/pando-lib-galois/graphs/dist_local_csr.hpp
@@ -6,6 +6,7 @@
 
 #include <pando-rt/export.h>
 
+#include <unordered_set>
 #include <utility>
 
 #include <pando-lib-galois/containers/array.hpp>
@@ -46,9 +47,13 @@ struct DLCSR_InitializeState {
 
 } // namespace internal
 
+template <typename VertexType, typename EdgeType>
+class MirrorDistLocalCSR;
+
 template <typename VertexType = WMDVertex, typename EdgeType = WMDEdge>
 class DistLocalCSR {
 public:
+  friend MirrorDistLocalCSR<VertexType, EdgeType>;
   using VertexTokenID = std::uint64_t;
   using VertexTopologyID = pando::GlobalPtr<Vertex>;
   using EdgeHandle = pando::GlobalPtr<HalfEdge>;
@@ -115,11 +120,11 @@ class DistLocalCSR {
     VertexIt& operator--() {
       auto currNode = static_cast<std::uint64_t>(galois::localityOf(m_pos).node.id);
       pointer ptr = m_pos - 1;
-      CSR csrCurr = arrayOfCSRs.get(currNode);
+      CSR csrCurr = arrayOfCSRs[currNode];
       if (csrCurr.vertexEdgeOffsets.begin() <= ptr || currNode == 0) {
         m_pos = ptr;
       } else {
-        csrCurr = arrayOfCSRs.get(currNode - 1);
+        csrCurr = arrayOfCSRs[currNode - 1];
         m_pos = csrCurr.vertexEdgeOffsets.end() - 2;
       }
       return *this;
@@ -198,12 +203,12 @@ class DistLocalCSR {
     VertexDataIt& operator++() {
       auto currNode = static_cast<std::uint64_t>(galois::localityOf(m_pos).node.id);
       pointer ptr = m_pos + 1;
-      CSR csrCurr = arrayOfCSRs.get(currNode);
+      CSR csrCurr = arrayOfCSRs[currNode];
       if (csrCurr.vertexData.end() > ptr ||
           currNode == static_cast<std::uint64_t>(pando::getPlaceDims().node.id - 1)) {
         m_pos = ptr;
       } else {
-        csrCurr = arrayOfCSRs.get(currNode + 1);
+        csrCurr = arrayOfCSRs[currNode + 1];
         m_pos = csrCurr.vertexData.begin();
       }
       return *this;
@@ -218,11 +223,11 @@ class DistLocalCSR {
     VertexDataIt& operator--() {
       auto currNode = static_cast<std::uint64_t>(galois::localityOf(m_pos).node.id);
       pointer ptr = m_pos - 1;
-      CSR csrCurr = arrayOfCSRs.get(currNode);
+      CSR csrCurr = arrayOfCSRs[currNode];
       if (csrCurr.vertexData.begin() <= ptr || currNode == 0) {
         m_pos = *ptr;
       } else {
-        csrCurr = arrayOfCSRs.get(currNode - 1);
+        csrCurr = arrayOfCSRs[currNode - 1];
         m_pos = *csrCurr.vertexData.end() - 1;
       }
       return *this;
@@ -351,6 +356,8 @@ class DistLocalCSR {
 
   template <typename, typename>
   friend class DistLocalCSR;
+  template <typename, typename>
+  friend class MirrorDistLocalCSR;
 
 public:
   constexpr DistLocalCSR() noexcept = default;
@@ -392,11 +399,29 @@ class DistLocalCSR {
 
   /** Vertex Manipulation **/
   VertexTopologyID getTopologyID(VertexTokenID tid) {
+    // Try this host's CSR first; the owning host is only computed when that lookup misses.
+    auto [ret, found] = fmap(getLocalCSR(), relaxedgetTopologyID, tid);
+    if (found) {
+      return ret;
+    }
+    std::uint64_t virtualHostID = tid % this->numVHosts();
+    std::uint64_t physicalHost = fmap(virtualToPhysicalMap.getLocalRef(), get, virtualHostID);
+    return fmap(arrayOfCSRs[physicalHost], getTopologyID, tid);
+  }
+
+private:
+  // Internal helper for MirrorDistLocalCSR (a friend class); not meant for external callers.
+  // Looks tid up in this host's CSR only; LCSR::getTopologyID aborts if the token is missing.
+  VertexTopologyID getLocalTopologyID(VertexTokenID tid) {
+    return fmap(getLocalCSR(), getTopologyID, tid);
+  }
+  VertexTopologyID getGlobalTopologyID(VertexTokenID tid) {  // owning host's CSR only
     std::uint64_t virtualHostID = tid % this->numVHosts();
     std::uint64_t physicalHost = fmap(virtualToPhysicalMap.getLocalRef(), get, virtualHostID);
     return fmap(arrayOfCSRs[physicalHost], getTopologyID, tid);
   }
 
+public:
   VertexTopologyID getTopologyIDFromIndex(std::uint64_t index) {
     std::uint64_t hostNum = 0;
     std::uint64_t hostSize;
@@ -456,13 +481,19 @@ class DistLocalCSR {
   }
   VertexDataRange vertexDataRange() noexcept {
     return VertexDataRange{arrayOfCSRs, lift(arrayOfCSRs[0], vertexData.begin),
-                           lift(arrayOfCSRs.get(arrayOfCSRs.size() - 1), vertexData.end),
-                           numVertices};
+                           lift(arrayOfCSRs[arrayOfCSRs.size() - 1], vertexData.end), numVertices};
   }
   EdgeDataRange edgeDataRange(VertexTopologyID vertex) noexcept {
     return fmap(getCSR(vertex), edgeDataRange, vertex);
   }
 
+  /** Host Information **/
+  std::uint64_t getPhysicalHostID(VertexTokenID tid) {
+    // Map the token to a virtual host by modulo, then look that virtual host up in the map.
+    const std::uint64_t vHost = tid % this->numVHosts();
+    return fmap(virtualToPhysicalMap.getLocalRef(), get, vHost);
+  }
+
   /** Topology Modifications **/
   VertexTopologyID addVertexTopologyOnly(VertexTokenID token) {
     return vertices().end();
@@ -642,7 +673,7 @@ class DistLocalCSR {
     std::uint64_t numVertices = 0;
     if constexpr (isEdgeList) {
       for (uint64_t h = 0; h < numHosts; h++) {
-        PANDO_CHECK(fmap(pHV.get(h), initialize, 0));
+        PANDO_CHECK(fmap(pHV[h], initialize, 0));
       }
       struct PHPV {
         HostIndexedMap<pando::Vector<pando::Vector<EdgeType>>> partEdges;
@@ -651,8 +682,8 @@ class DistLocalCSR {
       PHPV phpv{partEdges, pHV};
       galois::doAllEvenlyPartition(
           phpv, numHosts, +[](PHPV phpv, uint64_t host_id, uint64_t) {
-            pando::Vector<pando::Vector<EdgeType>> edgeVec = phpv.partEdges.get(host_id);
-            pando::GlobalRef<pando::Vector<VertexType>> vertexVec = phpv.pHV.get(host_id);
+            pando::Vector<pando::Vector<EdgeType>> edgeVec = phpv.partEdges[host_id];
+            pando::GlobalRef<pando::Vector<VertexType>> vertexVec = phpv.pHV[host_id];
             for (pando::Vector<EdgeType> vec : edgeVec) {
               EdgeType e = vec[0];
               VertexType v = VertexType(e.src, agile::TYPES::NONE);
@@ -661,7 +692,7 @@ class DistLocalCSR {
           });
 
       for (uint64_t h = 0; h < numHosts; h++) {
-        numVertices += lift(pHV.get(h), size);
+        numVertices += lift(pHV[h], size);
       }
     } else {
       numVertices = numVerticesRead;
@@ -693,6 +724,62 @@ class DistLocalCSR {
     return pando::Status::Success;
   }
 
+  /**
+   * @brief Builds, on every host, the list of vertex tokens that the host must mirror: the
+   * destinations of its local edges that map to another physical host (full mirroring).
+   */
+  template <typename ReadEdgeType>
+  HostLocalStorage<pando::Array<std::uint64_t>> getMirrorList(
+      galois::HostIndexedMap<pando::Vector<pando::Vector<ReadEdgeType>>> partEdges,
+      HostLocalStorage<pando::Array<std::uint64_t>> V2PM) {
+    HostLocalStorage<pando::Array<std::uint64_t>> mirrorList;
+    PANDO_CHECK(mirrorList.initialize());
+    auto createMirrors =
+        +[](galois::HostIndexedMap<pando::Vector<pando::Vector<ReadEdgeType>>> partEdges,
+            HostLocalStorage<pando::Array<std::uint64_t>> mirrorList,
+            HostLocalStorage<pando::Array<std::uint64_t>> V2PM, std::uint64_t i,
+            galois::WaitGroup::HandleType wgh) {
+          // Read the per-host handles once instead of re-resolving them on every iteration.
+          pando::Vector<pando::Vector<ReadEdgeType>> localEdges = partEdges.getLocalRef();
+          pando::Array<uint64_t> localV2PM = V2PM.getLocalRef();
+
+          // Collect the remote destinations in a set so every token is listed only once.
+          std::unordered_set<uint64_t> mirrorSet;
+          for (pando::Vector<ReadEdgeType> edgesOfVertex : localEdges) {
+            for (ReadEdgeType edge : edgesOfVertex) {
+              std::uint64_t dstPHost = fmap(localV2PM, get, edge.dst % localV2PM.size());
+              if (dstPHost != i) {
+                mirrorSet.insert(edge.dst);
+              }
+            }
+          }
+
+          // TODO(Divija): Make this parallel
+          // Copy the set into a pando::Array; the order is whatever the set iteration yields.
+          pando::Array<uint64_t> mirrors;
+          PANDO_CHECK(mirrors.initialize(mirrorSet.size()));
+          uint64_t idx = 0;
+          for (const uint64_t token : mirrorSet) {
+            mirrors[idx] = token;
+            idx++;
+          }
+          mirrorList.getLocalRef() = mirrors;
+          wgh.done();
+        };
+
+    std::uint64_t numHosts = static_cast<std::uint64_t>(pando::getPlaceDims().node.id);
+    galois::WaitGroup wg;
+    PANDO_CHECK(wg.initialize(numHosts));
+    auto wgh = wg.getHandle();
+    for (std::uint64_t i = 0; i < numHosts; i++) {
+      pando::Place place = pando::Place{pando::NodeIndex{static_cast<std::int16_t>(i)},
+                                        pando::anyPod, pando::anyCore};
+      PANDO_CHECK(pando::executeOn(place, createMirrors, partEdges, mirrorList, V2PM, i, wgh));
+    }
+    PANDO_CHECK(wg.wait());
+    return mirrorList;
+  }
+
   /**
    * @brief This initializer for workflow 4's edge lists
    */
@@ -814,7 +901,7 @@ class DistLocalCSR {
       }
       currentCSR.vertexEdgeOffsets[vertex] = Vertex{&currentCSR.edgeDestinations[currLocalEdge]};
 
-      arrayOfCSRs.get(host) = currentCSR;
+      arrayOfCSRs[host] = currentCSR;
       edgesStart = edgesEnd;
     }
     edgeCounts.deinitialize();
@@ -822,7 +909,7 @@ class DistLocalCSR {
 
     edgesStart = 0;
     for (uint64_t host = 0; host < hosts; host++) {
-      CSR currentCSR = arrayOfCSRs.get(host);
+      CSR currentCSR = arrayOfCSRs[host];
 
       uint64_t lastLocalVertexIndex = verticesPerHost * (host + 1) - 1;
       if (lastLocalVertexIndex >= numVertices) {
@@ -844,7 +931,7 @@ class DistLocalCSR {
           currEdge = edges[edgesStart + currLocalEdge + 1];
         }
       }
-      arrayOfCSRs.get(host) = currentCSR;
+      arrayOfCSRs[host] = currentCSR;
 
       edgesStart += currLocalEdge;
     }
@@ -923,7 +1010,7 @@ class DistLocalCSR {
     galois::onEach(
         state2, +[](InitializeEdgeState& state, uint64_t thread, uint64_t) {
           uint64_t host = static_cast<std::uint64_t>(pando::getCurrentNode().id);
-          CSR currentCSR = state.dlcsr.arrayOfCSRs.get(host);
+          CSR currentCSR = state.dlcsr.arrayOfCSRs[host];
 
           uint64_t hostOffset;
           PANDO_CHECK(state.edges.currentHostIndexOffset(hostOffset));
@@ -955,8 +1042,8 @@ class DistLocalCSR {
     galois::PerThreadVector<pando::Vector<EdgeType>> localEdges;
     PANDO_CHECK_RETURN(localEdges.initialize());
 
-    galois::DistArray<galois::HashTable<std::uint64_t, std::uint64_t>> perThreadRename;
-    PANDO_CHECK(perThreadRename.initialize(localEdges.size()));
+    galois::ThreadLocalStorage<galois::HashTable<std::uint64_t, std::uint64_t>> perThreadRename;
+    PANDO_CHECK(perThreadRename.initialize());
 
     for (auto hashRef : perThreadRename) {
       hashRef = galois::HashTable<std::uint64_t, std::uint64_t>{};
@@ -1000,13 +1087,14 @@ class DistLocalCSR {
 
 #ifdef FREE
     auto freePerThreadRename =
-        +[](galois::DistArray<galois::HashTable<std::uint64_t, std::uint64_t>> perThreadRename) {
+        +[](galois::ThreadLocalStorage<galois::HashTable<std::uint64_t, std::uint64_t>>
+                perThreadRename) {
           for (galois::HashTable<std::uint64_t, std::uint64_t> hash : perThreadRename) {
             hash.deinitialize();
           }
-          perThreadRename.deinitialize();
         };
     PANDO_CHECK(pando::executeOn(pando::anyPlace, freePerThreadRename, perThreadRename));
+    perThreadRename.deinitialize();
 #endif
 
     const bool isEdgeList = false;
@@ -1020,7 +1108,7 @@ class DistLocalCSR {
    */
   std::uint64_t getVertexLocalIndex(VertexTopologyID vertex) {
     std::uint64_t hostNum = static_cast<std::uint64_t>(galois::localityOf(vertex).node.id);
-    return fmap(arrayOfCSRs.get(hostNum), getVertexIndex, vertex);
+    return fmap(arrayOfCSRs[hostNum], getVertexIndex, vertex);
   }
 
   /**
@@ -1028,7 +1116,7 @@ class DistLocalCSR {
    */
 
   std::uint64_t localSize(std::uint32_t host) noexcept {
-    return lift(arrayOfCSRs.get(host), size);
+    return lift(arrayOfCSRs[host], size);
   }
 
   /**
@@ -1065,7 +1153,7 @@ class DistLocalCSR {
    */
   pando::GlobalRef<CSR> getLocalCSR() {
     std::uint64_t nodeIdx = static_cast<std::uint64_t>(pando::getCurrentPlace().node.id);
-    return arrayOfCSRs.get(nodeIdx);
+    return arrayOfCSRs[nodeIdx];
   }
 
 private:
diff --git a/include/pando-lib-galois/graphs/graph_traits.hpp b/include/pando-lib-galois/graphs/graph_traits.hpp
index a804be1b..70031eb3 100644
--- a/include/pando-lib-galois/graphs/graph_traits.hpp
+++ b/include/pando-lib-galois/graphs/graph_traits.hpp
@@ -196,5 +196,26 @@ struct graph_checker {
       sizeof(addEdgesTopologyOnly(0)) == sizeof(Yes) && sizeof(addEdges(0)) == sizeof(Yes) &&
       sizeof(deleteEdges(0)) == sizeof(Yes);
 };
+
+/**
+ * @brief Declares the master/mirror size and range queries expected of a gluon graph type
+ */
+template <typename G, typename VertexTokenID, typename VertexTopologyID, typename EdgeHandle,
+          typename VertexData, typename EdgeData, typename VertexRange, typename EdgeRange,
+          typename VertexDataRange, typename EdgeDataRange>
+struct gluon_graph {
+  /** Size **/
+  std::uint64_t getMasterSize();
+  std::uint64_t getMasterSize() const noexcept;
+  std::uint64_t getMirrorSize();
+  std::uint64_t getMirrorSize() const noexcept;
+
+  /** Range **/
+  VertexRange getMasterRange();
+  VertexRange getMirrorRange();
+
+  /** Sync **/
+  // template <typename Func> pando::Array<bool> sync(Func func, pando::Array<bool>);
+};
 } // namespace galois
 #endif // PANDO_LIB_GALOIS_GRAPHS_GRAPH_TRAITS_HPP_
diff --git a/include/pando-lib-galois/graphs/local_csr.hpp b/include/pando-lib-galois/graphs/local_csr.hpp
index f8a0e7b3..87ab0ed2 100644
--- a/include/pando-lib-galois/graphs/local_csr.hpp
+++ b/include/pando-lib-galois/graphs/local_csr.hpp
@@ -9,6 +9,8 @@
 #include <pando-lib-galois/containers/hashtable.hpp>
 #include <pando-lib-galois/graphs/graph_traits.hpp>
 #include <pando-lib-galois/loops/do_all.hpp>
+#include <pando-lib-galois/utility/pair.hpp>
+#include <pando-lib-galois/utility/tuple.hpp>
 #include <pando-rt/containers/array.hpp>
 #include <pando-rt/pando-rt.hpp>
 
@@ -233,10 +235,14 @@ class RefSpan {
 template <typename VertexType, typename EdgeType>
 class DistLocalCSR;
 
+template <typename VertexType, typename EdgeType>
+class MirrorDistLocalCSR;
+
 template <typename VertexType, typename EdgeType>
 class LCSR {
 public:
   friend DistLocalCSR<VertexType, EdgeType>;
+  friend MirrorDistLocalCSR<VertexType, EdgeType>;
   using VertexTokenID = std::uint64_t;
   using VertexTopologyID = pando::GlobalPtr<Vertex>;
   using EdgeHandle = pando::GlobalPtr<HalfEdge>;
@@ -409,9 +415,21 @@ class LCSR {
   }
 
   /** Vertex Manipulation **/
+private:
+  // Lookup that reports a miss through the returned flag instead of aborting.
+  // Callers must check the flag themselves; when it is false the returned ID must not be used.
+  galois::Pair<VertexTopologyID, bool> relaxedgetTopologyID(VertexTokenID token) {
+    pando::GlobalPtr<Vertex> ret;
+    bool found = tokenToTopology.get(token, ret);
+    return galois::make_tpl(ret, found);
+  }
+
+public:
   VertexTopologyID getTopologyID(VertexTokenID token) {
     pando::GlobalPtr<Vertex> ret;
     if (!tokenToTopology.get(token, ret)) {
+      std::cout << "In the host " << pando::getCurrentPlace().node.id  // logged before aborting
+                << " can't find token: " << token << std::endl;
       PANDO_ABORT("FAILURE TO FIND TOKENID");
     }
     return ret;
diff --git a/include/pando-lib-galois/graphs/mirror_dist_local_csr.hpp b/include/pando-lib-galois/graphs/mirror_dist_local_csr.hpp
new file mode 100644
index 00000000..d1216922
--- /dev/null
+++ b/include/pando-lib-galois/graphs/mirror_dist_local_csr.hpp
@@ -0,0 +1,456 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023. University of Texas at Austin. All rights reserved.
+
+#ifndef PANDO_LIB_GALOIS_GRAPHS_MIRROR_DIST_LOCAL_CSR_HPP_
+#define PANDO_LIB_GALOIS_GRAPHS_MIRROR_DIST_LOCAL_CSR_HPP_
+
+#include <pando-rt/export.h>
+
+#include <utility>
+
+#include "pando-rt/sync/mutex.hpp"
+#include <pando-lib-galois/containers/hashtable.hpp>
+#include <pando-lib-galois/containers/host_indexed_map.hpp>
+#include <pando-lib-galois/containers/host_local_storage.hpp>
+#include <pando-lib-galois/containers/per_thread.hpp>
+#include <pando-lib-galois/graphs/dist_local_csr.hpp>
+#include <pando-lib-galois/graphs/local_csr.hpp>
+#include <pando-lib-galois/import/wmd_graph_importer.hpp>
+#include <pando-lib-galois/loops/do_all.hpp>
+#include <pando-lib-galois/utility/gptr_monad.hpp>
+#include <pando-rt/containers/array.hpp>
+#include <pando-rt/containers/vector.hpp>
+#include <pando-rt/memory/memory_guard.hpp>
+#include <pando-rt/pando-rt.hpp>
+
+#define FREE 1
+
+namespace galois {
+
+namespace internal {
+
+template <typename VertexType, typename EdgeType>
+struct MDLCSR_InitializeState {  // NOTE(review): not referenced elsewhere in this header
+  using CSR = LCSR<VertexType, EdgeType>;
+
+  MDLCSR_InitializeState() = default;
+  MDLCSR_InitializeState(galois::HostIndexedMap<CSR> arrayOfCSRs_,
+                         galois::PerThreadVector<VertexType> vertices_,
+                         galois::PerThreadVector<EdgeType> edges_,
+                         galois::PerThreadVector<uint64_t> edgeCounts_)
+      : arrayOfCSRs(arrayOfCSRs_), vertices(vertices_), edges(edges_), edgeCounts(edgeCounts_) {}
+
+  galois::HostIndexedMap<CSR> arrayOfCSRs;  // CSR handles, indexed by host
+  galois::PerThreadVector<VertexType> vertices;
+  galois::PerThreadVector<EdgeType> edges;
+  galois::PerThreadVector<uint64_t> edgeCounts;
+};
+
+} // namespace internal
+
+template <typename VertexType = WMDVertex, typename EdgeType = WMDEdge>
+class MirrorDistLocalCSR {
+public:
+  using VertexTokenID = std::uint64_t;
+  using VertexTopologyID = pando::GlobalPtr<Vertex>;
+  using EdgeHandle = pando::GlobalPtr<HalfEdge>;
+  using VertexData = VertexType;
+  using EdgeData = EdgeType;
+  using EdgeRange = RefSpan<HalfEdge>;
+  using EdgeDataRange = pando::Span<EdgeData>;
+  using CSR = LCSR<VertexType, EdgeType>;
+  using DLCSR = DistLocalCSR<VertexType, EdgeType>;
+  using VertexRange = typename DLCSR::VertexRange;
+  using VertexDataRange = typename DLCSR::VertexDataRange;
+  using LocalVertexRange = typename CSR::VertexRange;
+  using LocalVertexDataRange = typename CSR::VertexDataRange;
+
+private:
+  template <typename T>
+  pando::GlobalRef<CSR> getCSR(pando::GlobalPtr<T> ptr) {
+    return dlcsr.getCSR(ptr);
+  }
+
+  EdgeHandle halfEdgeBegin(VertexTopologyID vertex) {
+    return dlcsr.halfEdgeBegin(vertex);
+  }
+
+  EdgeHandle halfEdgeEnd(VertexTopologyID vertex) {
+    return dlcsr.halfEdgeEnd(vertex);
+  }
+
+  std::uint64_t numVHosts() {
+    return dlcsr.numVHosts();
+  }
+
+public:
+  constexpr MirrorDistLocalCSR() noexcept = default;
+  constexpr MirrorDistLocalCSR(MirrorDistLocalCSR&&) noexcept = default;
+  constexpr MirrorDistLocalCSR(const MirrorDistLocalCSR&) noexcept = default;
+  ~MirrorDistLocalCSR() = default;
+
+  constexpr MirrorDistLocalCSR& operator=(const MirrorDistLocalCSR&) noexcept = default;
+  constexpr MirrorDistLocalCSR& operator=(MirrorDistLocalCSR&&) noexcept = default;
+
+  /** Official Graph APIS **/
+  void deinitialize() {
+    dlcsr.deinitialize();  // NOTE(review): mirror metadata members are not released here
+  }
+
+  /** size stuff **/
+  std::uint64_t size() noexcept {  // masters only: dlcsr size minus the mirror copies
+    return dlcsr.size() - _mirror_size;
+  }
+  std::uint64_t size() const noexcept {  // masters only: dlcsr size minus the mirror copies
+    return dlcsr.size() - _mirror_size;
+  }
+  std::uint64_t sizeEdges() noexcept {
+    return dlcsr.sizeEdges();
+  }
+  std::uint64_t sizeEdges() const noexcept {
+    return dlcsr.sizeEdges();
+  }
+  std::uint64_t getNumEdges(VertexTopologyID vertex) {
+    return dlcsr.getNumEdges(vertex);
+  }
+  std::uint64_t sizeMirrors() noexcept {
+    return _mirror_size;
+  }
+  std::uint64_t sizeMirrors() const noexcept {
+    return _mirror_size;
+  }
+
+  struct MirrorToMasterMap {  // pairs a mirror vertex with the master vertex it replicates
+    MirrorToMasterMap() = default;
+    MirrorToMasterMap(VertexTopologyID _mirror, VertexTopologyID _master)
+        : mirror(_mirror), master(_master) {}
+    VertexTopologyID mirror;  // filled from getLocalTopologyID in initializeAfterGather
+    VertexTopologyID master;  // filled from getGlobalTopologyID in initializeAfterGather
+    VertexTopologyID getMirror() {
+      return mirror;
+    }
+    VertexTopologyID getMaster() {
+      return master;
+    }
+  };
+
+  /** Vertex Manipulation **/
+  VertexTopologyID getTopologyID(VertexTokenID tid) {  // local CSR first, then the owning host
+    return dlcsr.getTopologyID(tid);
+  }
+
+  VertexTopologyID getLocalTopologyID(VertexTokenID tid) {  // this host's CSR only
+    return dlcsr.getLocalTopologyID(tid);
+  }
+
+  VertexTopologyID getGlobalTopologyID(VertexTokenID tid) {  // owning host's CSR only
+    return dlcsr.getGlobalTopologyID(tid);
+  }
+
+  pando::Array<MirrorToMasterMap> getLocalMirrorToRemoteMasterOrderedTable() {
+    return localMirrorToRemoteMasterOrderedTable.getLocalRef();
+  }
+
+  VertexTopologyID getTopologyIDFromIndex(std::uint64_t index) {
+    return dlcsr.getTopologyIDFromIndex(index);
+  }
+  VertexTokenID getTokenID(VertexTopologyID tid) {
+    return dlcsr.getTokenID(tid);
+  }
+  std::uint64_t getVertexIndex(VertexTopologyID vertex) {
+    return dlcsr.getVertexIndex(vertex);
+  }
+  pando::Place getLocalityVertex(VertexTopologyID vertex) {
+    return dlcsr.getLocalityVertex(vertex);
+  }
+
+  /** Edge Manipulation **/
+  EdgeHandle mintEdgeHandle(VertexTopologyID vertex, std::uint64_t off) {
+    return dlcsr.mintEdgeHandle(vertex, off);
+  }
+  VertexTopologyID getEdgeDst(EdgeHandle eh) {
+    return dlcsr.getEdgeDst(eh);
+  }
+
+  /** Data Manipulations **/
+  void setData(VertexTopologyID vertex, VertexData data) {
+    dlcsr.setData(vertex, data);
+  }
+  pando::GlobalRef<VertexData> getData(VertexTopologyID vertex) {
+    return dlcsr.getData(vertex);
+  }
+  void setEdgeData(EdgeHandle eh, EdgeData data) {
+    dlcsr.setEdgeData(eh, data);
+  }
+  pando::GlobalRef<EdgeData> getEdgeData(EdgeHandle eh) {
+    return dlcsr.getEdgeData(eh);
+  }
+
+  /** Ranges **/
+  VertexRange vertices() {
+    // This will include all mirrored vertices
+    return dlcsr.vertices();
+  }
+
+  EdgeRange edges(pando::GlobalPtr<galois::Vertex> vPtr) {
+    return dlcsr.edges(vPtr);
+  }
+  VertexDataRange vertexDataRange() noexcept {
+    return dlcsr.vertexDataRange();
+  }
+  EdgeDataRange edgeDataRange(VertexTopologyID vertex) noexcept {
+    return dlcsr.edgeDataRange(vertex);
+  }
+
+  /** Topology Modifications **/
+  VertexTopologyID addVertexTopologyOnly(VertexTokenID token) {
+    return dlcsr.addVertexTopologyOnly(token);
+  }
+  VertexTopologyID addVertex(VertexTokenID token, VertexData data) {
+    return dlcsr.addVertex(token, data);
+  }
+  pando::Status addEdgesTopologyOnly(VertexTopologyID src, pando::Vector<VertexTopologyID> dsts) {
+    return dlcsr.addEdgesTopologyOnly(src, dsts);
+  }
+  pando::Status addEdges(VertexTopologyID src, pando::Vector<VertexTopologyID> dsts,
+                         pando::Vector<EdgeData> data) {
+    return dlcsr.addEdges(src, dsts, data);
+  }
+  pando::Status deleteEdges(VertexTopologyID src, pando::Vector<EdgeHandle> edges) {
+    return dlcsr.deleteEdges(src, edges);
+  }
+
+  /** Gluon Graph APIS **/
+
+  /** Size **/
+  std::uint64_t getMasterSize() noexcept {
+    return lift(masterRange.getLocalRef(), size);
+  }
+  std::uint64_t getMirrorSize() noexcept {
+    return lift(mirrorRange.getLocalRef(), size);
+  }
+
+  /** Range **/
+  LocalVertexRange getMasterRange() {
+    return masterRange.getLocalRef();
+  }
+  LocalVertexRange getMirrorRange() {
+    return mirrorRange.getLocalRef();
+  }
+
+  /** Host Information **/
+  std::uint64_t getPhysicalHostID(VertexTokenID tid) {
+    return dlcsr.getPhysicalHostID(tid);
+  }
+
+  /** Sync **/
+  // TODO(Ying-Wei):
+  // write a sync function that reduces mirror values and then broadcasts master values
+  // return a bitmap of modified vertices
+  //
+  // template <typename Func>
+  // pando::Array<bool> sync(Func func, pando::Array<bool>) {
+  //}
+
+  /**
+   * @brief get the vertex's dense index within the CSR of the host that stores it
+   */
+  std::uint64_t getVertexLocalIndex(VertexTopologyID vertex) {
+    return dlcsr.getVertexLocalIndex(vertex);
+  }
+
+  /**
+   * @brief gives the size (vertex count, not edge count) of the CSR stored on the given host
+   */
+
+  std::uint64_t localSize(std::uint32_t host) noexcept {
+    return dlcsr.localSize(host);
+  }
+
+  /**
+   * @brief Sets the value of the edge provided
+   */
+  void setEdgeData(VertexTopologyID vertex, std::uint64_t off, EdgeData data) {
+    dlcsr.setEdgeData(mintEdgeHandle(vertex, off), data);
+  }
+
+  /**
+   * @brief gets the reference to the data of the edge at the given offset from the vertex
+   */
+  pando::GlobalRef<EdgeData> getEdgeData(VertexTopologyID vertex, std::uint64_t off) {
+    return dlcsr.getEdgeData(mintEdgeHandle(vertex, off));
+  }
+
+  /**
+   * @brief get the vertex at the end of the edge provided by vertex at the offset from the start
+   */
+  VertexTopologyID getEdgeDst(VertexTopologyID vertex, std::uint64_t off) {
+    return dlcsr.getEdgeDst(mintEdgeHandle(vertex, off));
+  }
+
+  bool isLocal(VertexTopologyID vertex) {
+    return dlcsr.isLocal(vertex);
+  }
+
+  bool isOwned(VertexTopologyID vertex) {
+    return dlcsr.isOwned(vertex);
+  }
+
+  /**
+   * @brief Get the local csr
+   */
+  pando::GlobalRef<CSR> getLocalCSR() {
+    return dlcsr.getLocalCSR();
+  }
+
+  /**
+   * @brief Appends mirror vertices to each host's vertex list, builds the wrapped DistLocalCSR
+   * from the result, then records per-host master/mirror ranges and the mirror-to-master table.
+   */
+  template <typename ReadVertexType, typename ReadEdgeType>
+  pando::Status initializeAfterGather(
+      galois::HostIndexedMap<pando::Vector<ReadVertexType>> vertexData, std::uint64_t numVertices,
+      galois::HostIndexedMap<pando::Vector<pando::Vector<ReadEdgeType>>> edgeData,
+      galois::HostIndexedMap<galois::HashTable<std::uint64_t, std::uint64_t>> edgeMap,
+      galois::HostIndexedMap<std::uint64_t> numEdges,
+      HostLocalStorage<pando::Array<std::uint64_t>> virtualToPhysical) {
+    std::uint64_t numHosts = static_cast<std::uint64_t>(pando::getPlaceDims().node.id);
+    galois::WaitGroup wg;
+    PANDO_CHECK(wg.initialize(numHosts));
+    auto wgh = wg.getHandle();
+    _mirror_size = 0;
+    HostLocalStorage<pando::Array<VertexTokenID>> mirrorList;
+    mirrorList = this->dlcsr.getMirrorList(edgeData, virtualToPhysical);
+    PANDO_CHECK(masterRange.initialize());
+    PANDO_CHECK(mirrorRange.initialize());
+    PANDO_CHECK(localMirrorToRemoteMasterOrderedTable.initialize());
+
+    auto mirrorAttach = +[](galois::HostIndexedMap<pando::Vector<ReadVertexType>> vertexData,
+                            HostLocalStorage<pando::Array<VertexTokenID>> mirrorList,
+                            std::uint64_t i, galois::WaitGroup::HandleType wgh) {
+      pando::Vector<ReadVertexType> curVertexData = vertexData[i];
+      pando::Array<VertexTokenID> curMirrorList = mirrorList[i];
+      for (uint64_t j = 0; j < lift(curMirrorList, size); j++) {
+        ReadVertexType v = ReadVertexType{curMirrorList[j]};
+        PANDO_CHECK(fmap(curVertexData, pushBack, v));
+      }
+      vertexData[i] = curVertexData;
+      wgh.done();
+    };
+    for (std::uint64_t i = 0; i < numHosts; i++) {
+      pando::Place place = pando::Place{pando::NodeIndex{static_cast<std::int16_t>(i)},
+                                        pando::anyPod, pando::anyCore};
+      PANDO_CHECK(pando::executeOn(place, mirrorAttach, vertexData, mirrorList, i, wgh));
+      const std::uint64_t local_mirror_size = lift(mirrorList[i], size);
+      numVertices += local_mirror_size;
+      _mirror_size += local_mirror_size;
+    }
+    PANDO_CHECK(wg.wait());
+    wgh.add(numHosts);
+
+    // numVertices now counts masters plus mirrors; propagate a failed build to the caller.
+    PANDO_CHECK_RETURN(this->dlcsr.initializeAfterGather(vertexData, numVertices, edgeData,
+                                                         edgeMap, numEdges, virtualToPhysical));
+
+    // Generate masterRange, mirrorRange, localMirrorToRemoteMasterOrderedTable
+    auto generateMetadata = +[](MirrorDistLocalCSR<VertexType, EdgeType> mdlcsr,
+                                DistLocalCSR<VertexType, EdgeType> dlcsr,
+                                HostLocalStorage<pando::Array<std::uint64_t>> mirrorList,
+                                std::uint64_t i, galois::WaitGroup::HandleType wgh) {
+      pando::Array<std::uint64_t> localMirrorList = mirrorList[i];
+      uint64_t mirror_size = lift(localMirrorList, size);
+      CSR csrCurr = dlcsr.arrayOfCSRs[i];
+
+      // The last mirror_size vertices of the local CSR are treated as mirrors, the rest as masters.
+      LocalVertexRange _masterRange = LocalVertexRange(lift(csrCurr, vertexEdgeOffsets.begin),
+                                                       lift(csrCurr, size) - mirror_size);
+
+      LocalVertexRange _mirrorRange = LocalVertexRange(
+          lift(csrCurr, vertexEdgeOffsets.begin) + lift(csrCurr, size) - mirror_size, mirror_size);
+
+      pando::Array<MirrorToMasterMap> _localMirrorToRemoteMasterOrderedTable =
+          mdlcsr.localMirrorToRemoteMasterOrderedTable.getLocalRef();
+      PANDO_CHECK(fmap(_localMirrorToRemoteMasterOrderedTable, initialize, mirror_size));
+      for (uint64_t j = 0; j < mirror_size; j++) {
+        _localMirrorToRemoteMasterOrderedTable[j] =
+            MirrorToMasterMap(fmap(dlcsr, getLocalTopologyID, localMirrorList[j]),
+                              fmap(dlcsr, getGlobalTopologyID, localMirrorList[j]));
+      }
+      mdlcsr.masterRange.getLocalRef() = _masterRange;
+      mdlcsr.mirrorRange.getLocalRef() = _mirrorRange;
+      mdlcsr.localMirrorToRemoteMasterOrderedTable.getLocalRef() =
+          _localMirrorToRemoteMasterOrderedTable;
+      wgh.done();
+    };
+
+    for (std::uint64_t i = 0; i < numHosts; i++) {
+      pando::Place place = pando::Place{pando::NodeIndex{static_cast<std::int16_t>(i)},
+                                        pando::anyPod, pando::anyCore};
+      PANDO_CHECK(
+          pando::executeOn(place, generateMetadata, *this, this->dlcsr, mirrorList, i, wgh));
+    }
+    PANDO_CHECK(wg.wait());
+    return pando::Status::Success;
+  }
+
+  // TODO(Ying-Wei):
+  // use doAll to send remoteMasterToLocalMirrorMap to the corresponding remote hosts
+  // no need to use executeOn
+  // just push to the localMasterToRemoteMirrorTable vector
+  // make sure to use the spin lock in pando-rt
+  /**
+   * @brief Get the mutex stored for host host_id (not necessarily the calling host)
+   */
+  pando::GlobalRef<pando::Mutex> getLocalMutex(std::uint64_t host_id) {
+    return hostMutex[host_id];  // hostMutex is initialized by setupCommunication()
+  }
+
+  pando::Status setupCommunication() {
+    PANDO_CHECK_RETURN(localMasterToRemoteMirrorTable.initialize());
+    // NOTE(review): looks unfinished; the captureless lambda below calls member functions.
+    PANDO_CHECK_RETURN(hostMutex.initialize());
+    // NOTE(review): it also initializes the mirror table to size 0 and pushes m as an EdgeHandle.
+    PANDO_CHECK_RETURN(galois::doAll(
+        localMirrorToRemoteMasterOrderedTable, localMasterToRemoteMirrorTable,
+        +[](galois::HostLocalStorage<pando::Array<MirrorToMasterMap>>
+                localMirrorToRemoteMasterOrderedTable,
+            pando::GlobalRef<pando::Vector<EdgeHandle>> localMasterToRemoteMirrorTable) {
+          PANDO_CHECK(fmap(localMirrorToRemoteMasterOrderedTable, initialize, 0));
+          pando::Array<MirrorToMasterMap> remoteMasterToLocalMirrorMap =
+              localMirrorToRemoteMasterOrderedTable.getLocal();
+          for (MirrorToMasterMap m : remoteMasterToLocalMirrorMap) {
+            VertexTopologyID masterTopologyID = m.master;
+            VertexTokenID masterTokenID = getTokenID(masterTopologyID);
+            std::uint64_t physicalHost = getPhysicalHostID(masterTokenID);
+            pando::Mutex mutex = getLocalMutex(physicalHost);
+            // NOTE(review): `mutex` is a by-value copy of what getLocalMutex returns - confirm.
+            // Lock mutex to ensure atomic append to the vector
+            mutex.lock();
+            PANDO_CHECK(fmap(localMasterToRemoteMirrorTable, pushBack, m));
+            mutex.unlock();
+          }
+        }));
+
+    return pando::Status::Success;
+  }
+
+private:
+  DLCSR dlcsr;
+  uint64_t _mirror_size = 0;  // mirror vertices summed over all hosts; 0 until initialized
+  galois::HostLocalStorage<LocalVertexRange> masterRange;
+  galois::HostLocalStorage<LocalVertexRange> mirrorRange;
+  galois::HostLocalStorage<pando::Array<MirrorToMasterMap>> localMirrorToRemoteMasterOrderedTable;
+
+  // TODO(Ying-Wei):
+  // generate the following
+  galois::HostLocalStorage<pando::Mutex> hostMutex;
+  galois::HostLocalStorage<pando::Vector<EdgeHandle>> localMasterToRemoteMirrorTable;
+  // galois::GlobalBarrier barrier;
+};
+
+static_assert(graph_checker<MirrorDistLocalCSR<std::uint64_t, std::uint64_t>>::value);
+static_assert(graph_checker<MirrorDistLocalCSR<WMDVertex, WMDEdge>>::value);
+
+} // namespace galois
+
+#endif // PANDO_LIB_GALOIS_GRAPHS_MIRROR_DIST_LOCAL_CSR_HPP_
diff --git a/include/pando-lib-galois/import/ingest_rmat_el.hpp b/include/pando-lib-galois/import/ingest_rmat_el.hpp
index e8fd90b9..c68bb6b3 100644
--- a/include/pando-lib-galois/import/ingest_rmat_el.hpp
+++ b/include/pando-lib-galois/import/ingest_rmat_el.hpp
@@ -8,7 +8,9 @@
 
 #include <pando-lib-galois/containers/dist_array.hpp>
 #include <pando-lib-galois/containers/hashtable.hpp>
+#include <pando-lib-galois/containers/thread_local_storage.hpp>
 #include <pando-lib-galois/graphs/dist_local_csr.hpp>
+#include <pando-lib-galois/graphs/mirror_dist_local_csr.hpp>
 #include <pando-rt/memory/memory_guard.hpp>
 
 namespace galois {
@@ -32,7 +34,7 @@ void loadELFilePerThread(
     galois::WaitGroup::HandleType wgh, pando::Array<char> filename, std::uint64_t segmentsPerThread,
     std::uint64_t numThreads, std::uint64_t threadID,
     galois::PerThreadVector<pando::Vector<ELEdge>> localEdges,
-    galois::DistArray<galois::HashTable<std::uint64_t, std::uint64_t>> perThreadRename,
+    galois::ThreadLocalStorage<galois::HashTable<std::uint64_t, std::uint64_t>> perThreadRename,
     std::uint64_t numVertices);
 
 const char* elGetOne(const char* line, std::uint64_t& val);
@@ -50,17 +52,16 @@ pando::Status generateEdgesPerVirtualHost(pando::GlobalRef<pando::Vector<ELVerte
                                           std::uint64_t totalVertices, std::uint64_t vHostID,
                                           std::uint64_t numVHosts);
 
-template <typename VertexType, typename EdgeType>
-galois::DistLocalCSR<VertexType, EdgeType> initializeELDLCSR(pando::Array<char> filename,
-                                                             std::uint64_t numVertices,
-                                                             std::uint64_t vHostsScaleFactor = 8) {
+template <typename ReturnType, typename VertexType, typename EdgeType>
+ReturnType initializeELDLCSR(pando::Array<char> filename, std::uint64_t numVertices,
+                             std::uint64_t vHostsScaleFactor = 8) {
   galois::PerThreadVector<pando::Vector<ELEdge>> localEdges;
   PANDO_CHECK(localEdges.initialize());
 
   const std::uint64_t numThreads = localEdges.size() - pando::getPlaceDims().node.id;
 
-  galois::DistArray<galois::HashTable<std::uint64_t, std::uint64_t>> perThreadRename;
-  PANDO_CHECK(perThreadRename.initialize(localEdges.size()));
+  galois::ThreadLocalStorage<galois::HashTable<std::uint64_t, std::uint64_t>> perThreadRename;
+  PANDO_CHECK(perThreadRename.initialize());
 
   for (auto hashRef : perThreadRename) {
     hashRef = galois::HashTable<std::uint64_t, std::uint64_t>{};
@@ -92,13 +93,14 @@ galois::DistLocalCSR<VertexType, EdgeType> initializeELDLCSR(pando::Array<char>
 
 #ifdef FREE
   auto freePerThreadRename =
-      +[](galois::DistArray<galois::HashTable<std::uint64_t, std::uint64_t>> perThreadRename) {
+      +[](galois::ThreadLocalStorage<galois::HashTable<std::uint64_t, std::uint64_t>>
+              perThreadRename) {
         for (galois::HashTable<std::uint64_t, std::uint64_t> hash : perThreadRename) {
           hash.deinitialize();
         }
-        perThreadRename.deinitialize();
       };
   PANDO_CHECK(pando::executeOn(pando::anyPlace, freePerThreadRename, perThreadRename));
+  perThreadRename.deinitialize();
 #endif
 
   PANDO_CHECK(
@@ -146,7 +148,7 @@ galois::DistLocalCSR<VertexType, EdgeType> initializeELDLCSR(pando::Array<char>
   auto [partEdges, renamePerHost] =
       internal::partitionEdgesParallely(pHV, std::move(localEdges), v2PM);
 
-  using Graph = galois::DistLocalCSR<VertexType, EdgeType>;
+  using Graph = ReturnType;
   Graph graph;
   graph.template initializeAfterGather<galois::ELVertex, galois::ELEdge>(
       pHV, numVertices, partEdges, renamePerHost, numEdges,
diff --git a/include/pando-lib-galois/import/ingest_wmd_csv.hpp b/include/pando-lib-galois/import/ingest_wmd_csv.hpp
index 1139c343..d5144279 100644
--- a/include/pando-lib-galois/import/ingest_wmd_csv.hpp
+++ b/include/pando-lib-galois/import/ingest_wmd_csv.hpp
@@ -6,6 +6,7 @@
 
 #include <utility>
 
+#include <pando-lib-galois/containers/thread_local_vector.hpp>
 #include <pando-lib-galois/graphs/dist_local_csr.hpp>
 #include <pando-rt/memory/memory_guard.hpp>
 #include <pando-rt/tracing.hpp>
@@ -15,9 +16,9 @@ namespace galois {
 void loadWMDFilePerThread(
     galois::WaitGroup::HandleType wgh, pando::Array<char> filename, std::uint64_t segmentsPerThread,
     std::uint64_t numThreads, std::uint64_t threadID,
-    galois::PerThreadVector<pando::Vector<WMDEdge>> localEdges,
-    galois::DistArray<galois::HashTable<std::uint64_t, std::uint64_t>> perThreadRename,
-    galois::PerThreadVector<WMDVertex> localVertices, galois::DAccumulator<std::uint64_t> totVerts);
+    PerThreadVector<pando::Vector<WMDEdge>> localEdges,
+    ThreadLocalStorage<HashTable<std::uint64_t, std::uint64_t>> perThreadRename,
+    ThreadLocalVector<WMDVertex> localReadVertices, galois::DAccumulator<std::uint64_t> totVerts);
 
 template <typename VertexFunc, typename EdgeFunc>
 pando::Status wmdCSVParse(const char* line, pando::Array<galois::StringView> tokens,
@@ -65,8 +66,8 @@ galois::DistLocalCSR<VertexType, EdgeType> initializeWMDDLCSR(pando::Array<char>
   galois::PerThreadVector<pando::Vector<WMDEdge>> localEdges;
   PANDO_CHECK(localEdges.initialize());
 
-  galois::PerThreadVector<WMDVertex> localVertices;
-  PANDO_CHECK(localVertices.initialize());
+  galois::ThreadLocalVector<WMDVertex> localReadVertices;
+  PANDO_CHECK(localReadVertices.initialize());
 
   const std::uint64_t numThreads = localEdges.size() - pando::getPlaceDims().node.id;
   const std::uint64_t hosts = static_cast<std::uint64_t>(pando::getPlaceDims().node.id);
@@ -79,8 +80,8 @@ galois::DistLocalCSR<VertexType, EdgeType> initializeWMDDLCSR(pando::Array<char>
   galois::DAccumulator<std::uint64_t> totVerts;
   PANDO_CHECK(totVerts.initialize());
 
-  galois::DistArray<galois::HashTable<std::uint64_t, std::uint64_t>> perThreadRename{};
-  PANDO_CHECK(perThreadRename.initialize(localEdges.size()));
+  galois::ThreadLocalStorage<galois::HashTable<std::uint64_t, std::uint64_t>> perThreadRename{};
+  PANDO_CHECK(perThreadRename.initialize());
 
   for (auto hashRef : perThreadRename) {
     hashRef = galois::HashTable<std::uint64_t, std::uint64_t>{};
@@ -92,7 +93,7 @@ galois::DistLocalCSR<VertexType, EdgeType> initializeWMDDLCSR(pando::Array<char>
     pando::Place place = pando::Place{pando::NodeIndex{static_cast<std::int16_t>(i % hosts)},
                                       pando::anyPod, pando::anyCore};
     PANDO_CHECK(pando::executeOn(place, &galois::loadWMDFilePerThread, wgh, filename, 1, numThreads,
-                                 i, localEdges, perThreadRename, localVertices, totVerts));
+                                 i, localEdges, perThreadRename, localReadVertices, totVerts));
   }
 
   pando::GlobalPtr<pando::Array<galois::Pair<std::uint64_t, std::uint64_t>>> labeledEdgeCounts;
@@ -103,13 +104,14 @@ galois::DistLocalCSR<VertexType, EdgeType> initializeWMDDLCSR(pando::Array<char>
 
 #ifdef FREE
   auto freePerThreadRename =
-      +[](galois::DistArray<galois::HashTable<std::uint64_t, std::uint64_t>> perThreadRename) {
+      +[](galois::ThreadLocalStorage<galois::HashTable<std::uint64_t, std::uint64_t>>
+              perThreadRename) {
         for (galois::HashTable<std::uint64_t, std::uint64_t> hash : perThreadRename) {
           hash.deinitialize();
         }
-        perThreadRename.deinitialize();
       };
   PANDO_CHECK(pando::executeOn(pando::anyPlace, freePerThreadRename, perThreadRename));
+  perThreadRename.deinitialize();
 #endif
 
   PANDO_CHECK(
@@ -130,7 +132,7 @@ galois::DistLocalCSR<VertexType, EdgeType> initializeWMDDLCSR(pando::Array<char>
 
   /** Generate Vertex Partition **/
   galois::HostIndexedMap<pando::Vector<WMDVertex>> pHV =
-      internal::partitionVerticesParallel(std::move(localVertices), v2PM);
+      internal::partitionVerticesParallel(std::move(localReadVertices), v2PM);
 
   /** Generate Edge Partition **/
   auto [partEdges, renamePerHost] =
diff --git a/include/pando-lib-galois/import/wmd_graph_importer.hpp b/include/pando-lib-galois/import/wmd_graph_importer.hpp
index d6078687..e437960b 100644
--- a/include/pando-lib-galois/import/wmd_graph_importer.hpp
+++ b/include/pando-lib-galois/import/wmd_graph_importer.hpp
@@ -19,6 +19,8 @@
 #include <pando-lib-galois/containers/hashtable.hpp>
 #include <pando-lib-galois/containers/host_indexed_map.hpp>
 #include <pando-lib-galois/containers/per_thread.hpp>
+#include <pando-lib-galois/containers/thread_local_storage.hpp>
+#include <pando-lib-galois/containers/thread_local_vector.hpp>
 #include <pando-lib-galois/graphs/wmd_graph.hpp>
 #include <pando-lib-galois/import/ifstream.hpp>
 #include <pando-lib-galois/import/schema.hpp>
@@ -125,6 +127,7 @@ void buildEdgeCountToSend(
       }));
   PANDO_CHECK(wg.wait());
   labeledEdgeCounts = sumArray;
+  wg.deinitialize();
 }
 
 [[nodiscard]] pando::Expected<
@@ -255,7 +258,7 @@ partitionEdgesParallely(HostIndexedMap<pando::Vector<VertexType>> partitionedVer
                       galois::internal::scan_op<SRC_Val, DST_Val>,
                       galois::internal::combiner<DST_Val>, galois::Array>
         prefixSum(arr, prefixArr);
-    PANDO_CHECK(prefixSum.initialize());
+    PANDO_CHECK(prefixSum.initialize(pando::getPlaceDims().core.x * pando::getPlaceDims().core.y));
     prefixSum.computePrefixSum(lift(localEdges, size));
   }
   HostIndexedMap<pando::Vector<pando::Vector<EdgeType>>> pHVEdge{};
@@ -368,11 +371,11 @@ template <typename EdgeType>
 
 template <typename VertexType>
 [[nodiscard]] galois::HostIndexedMap<pando::Vector<VertexType>> partitionVerticesParallel(
-    galois::PerThreadVector<VertexType> localVertices, pando::Array<std::uint64_t> v2PM) {
+    galois::ThreadLocalVector<VertexType>&& localReadVertices, pando::Array<std::uint64_t> v2PM) {
   DistArray<HostIndexedMap<pando::Vector<VertexType>>> perThreadVerticesPartition;
-  PANDO_CHECK(perThreadVerticesPartition.initialize(localVertices.size()));
+  PANDO_CHECK(perThreadVerticesPartition.initialize(localReadVertices.size()));
   std::uint64_t numHosts = static_cast<std::uint64_t>(pando::getPlaceDims().node.id);
-  for (uint64_t i = 0; i < localVertices.size(); i++) {
+  for (uint64_t i = 0; i < localReadVertices.size(); i++) {
     PANDO_CHECK(lift(perThreadVerticesPartition[i], initialize));
     HostIndexedMap<pando::Vector<VertexType>> pVec = perThreadVerticesPartition[i];
     for (uint64_t j = 0; j < numHosts; j++) {
@@ -380,19 +383,20 @@ template <typename VertexType>
     }
   }
 
+  const std::uint64_t numThreads = localReadVertices.size();
   HostIndexedMap<galois::Array<std::uint64_t>> numVerticesPerHostPerThread{};
   HostIndexedMap<galois::Array<std::uint64_t>> prefixArrPerHostPerThread{};
   PANDO_CHECK(numVerticesPerHostPerThread.initialize());
   PANDO_CHECK(prefixArrPerHostPerThread.initialize());
   for (std::uint64_t i = 0; i < numHosts; i++) {
-    PANDO_CHECK(fmap(numVerticesPerHostPerThread[i], initialize, lift(localVertices, size)));
-    PANDO_CHECK(fmap(prefixArrPerHostPerThread[i], initialize, lift(localVertices, size)));
+    PANDO_CHECK(fmap(numVerticesPerHostPerThread[i], initialize, numThreads));
+    PANDO_CHECK(fmap(prefixArrPerHostPerThread[i], initialize, numThreads));
   }
 
   auto newVec =
-      make_tpl(perThreadVerticesPartition, localVertices, v2PM, numVerticesPerHostPerThread);
+      make_tpl(perThreadVerticesPartition, localReadVertices, v2PM, numVerticesPerHostPerThread);
   galois::doAllEvenlyPartition(
-      newVec, lift(localVertices, size), +[](decltype(newVec) newVec, uint64_t tid, uint64_t) {
+      newVec, numThreads, +[](decltype(newVec) newVec, uint64_t tid, uint64_t) {
         auto [perThreadVerticesPT, localVerticesVec, v2PMap, prefixArr] = newVec;
         pando::GlobalPtr<pando::Vector<VertexType>> localVerticesPtr = localVerticesVec.get(tid);
         pando::Vector<VertexType> localVertices = *localVerticesPtr;
@@ -407,6 +411,7 @@ template <typename VertexType>
           *(arr.begin() + tid) = lift(vertVec[i], size);
         }
       });
+  localReadVertices.deinitialize();
 
   // Compute prefix sum
   using SRC = galois::Array<uint64_t>;
@@ -421,8 +426,8 @@ template <typename VertexType>
                       galois::internal::scan_op<SRC_Val, DST_Val>,
                       galois::internal::combiner<DST_Val>, galois::Array>
         prefixSum(arr, prefixArr);
-    PANDO_CHECK(prefixSum.initialize());
-    prefixSum.computePrefixSum(lift(localVertices, size));
+    PANDO_CHECK(prefixSum.initialize(pando::getPlaceDims().core.x * pando::getPlaceDims().core.y));
+    prefixSum.computePrefixSum(numThreads);
   }
 
   galois::HostIndexedMap<pando::Vector<VertexType>> pHV{};
@@ -430,12 +435,12 @@ template <typename VertexType>
 
   for (uint64_t i = 0; i < numHosts; i++) {
     galois::Array<uint64_t> prefixArr = prefixArrPerHostPerThread[i];
-    PANDO_CHECK(fmap(pHV[i], initialize, prefixArr[lift(localVertices, size) - 1]));
+    PANDO_CHECK(fmap(pHV[i], initialize, prefixArr[numThreads - 1]));
   }
 
   auto phVec = make_tpl(pHV, prefixArrPerHostPerThread, perThreadVerticesPartition);
   galois::doAllEvenlyPartition(
-      phVec, lift(localVertices, size), +[](decltype(phVec) phVec, uint64_t threadID, uint64_t) {
+      phVec, numThreads, +[](decltype(phVec) phVec, uint64_t threadID, uint64_t) {
         auto [pHV, prefixArrPerHost, PHVertex] = phVec;
         std::uint64_t numHosts = static_cast<std::uint64_t>(pando::getPlaceDims().node.id);
         for (uint64_t i = 0; i < numHosts; i++) {
@@ -610,10 +615,9 @@ void loadEdgeFilePerThread(
     pando::NotificationHandle done, galois::EdgeParser<EdgeType> parser, uint64_t segmentsPerThread,
     std::uint64_t numThreads, std::uint64_t threadID,
     galois::PerThreadVector<pando::Vector<EdgeType>> localEdges,
-    galois::DistArray<galois::HashTable<std::uint64_t, std::uint64_t>> perThreadRename) {
-  auto hartID = localEdges.getLocalVectorID();
+    galois::ThreadLocalStorage<galois::HashTable<std::uint64_t, std::uint64_t>> perThreadRename) {
   auto localEdgeVec = localEdges.getThreadVector();
-  auto hashRef = perThreadRename[hartID];
+  auto hashRef = perThreadRename.getLocalRef();
 
   auto parseLine = [&parser, &localEdgeVec, &hashRef](const char* currentLine) {
     if (currentLine[0] != parser.comment) {
diff --git a/include/pando-lib-galois/sync/simple_lock.hpp b/include/pando-lib-galois/sync/simple_lock.hpp
index ab23a554..2c9c300c 100644
--- a/include/pando-lib-galois/sync/simple_lock.hpp
+++ b/include/pando-lib-galois/sync/simple_lock.hpp
@@ -48,16 +48,7 @@ class SimpleLock {
    * @warning one of the initialize methods must be called before use
    */
   [[nodiscard]] pando::Status initialize(pando::Place place, pando::MemoryType memoryType) {
-    // auto desiredValue = static_cast<LockState>(State::IsUnlocked);
-    // pando::atomicStore(pando::GlobalPtr<LockState>(&m_state), pando::GlobalPtr<const
-    // LockState>(&desiredValue),
-    //             std::memory_order_release);
-
-    const auto desiredValue = pando::allocateMemory<LockState>(1, place, memoryType);
-    if (!desiredValue.hasValue()) {
-      return desiredValue.error();
-    }
-    m_state = desiredValue.value();
+    m_state = PANDO_EXPECT_RETURN(pando::allocateMemory<LockState>(1, place, memoryType));
     *m_state = static_cast<LockState>(State::IsUnlocked);
     pando::atomicThreadFence(std::memory_order_release);
     return pando::Status::Success;
diff --git a/include/pando-lib-galois/utility/gptr_monad.hpp b/include/pando-lib-galois/utility/gptr_monad.hpp
index c384ba1b..e4e254d8 100644
--- a/include/pando-lib-galois/utility/gptr_monad.hpp
+++ b/include/pando-lib-galois/utility/gptr_monad.hpp
@@ -7,43 +7,51 @@
 /**
  * @brief lifts a function with no arguments to work on references
  */
-#define lift(ref, func)                                                   \
-  __extension__({                                                         \
-    typename std::pointer_traits<decltype(&ref)>::element_type tmp = ref; \
-    auto ret = tmp.func();                                                \
-    ref = tmp;                                                            \
-    ret;                                                                  \
+#define lift(ref, func)                                                               \
+  __extension__({                                                                     \
+    /* Evaluate `ref` once; operate on a local copy and write it back. */             \
+    auto liftedPtr = &(ref);                                                          \
+    typename std::pointer_traits<decltype(liftedPtr)>::element_type tmp = *liftedPtr; \
+    auto ret = tmp.func();                                                            \
+    *liftedPtr = tmp;                                                                 \
+    ret;                                                                              \
+  })
 
 /**
  * @brief lifts a function with no arguments to work on a void return type
  */
-#define liftVoid(ref, func)                                               \
-  do {                                                                    \
-    typename std::pointer_traits<decltype(&ref)>::element_type tmp = ref; \
-    tmp.func();                                                           \
-    ref = tmp;                                                            \
+#define liftVoid(ref, func)                                                           \
+  do {                                                                                \
+    /* Evaluate `ref` once; operate on a local copy and write it back. */             \
+    auto liftedPtr = &(ref);                                                          \
+    typename std::pointer_traits<decltype(liftedPtr)>::element_type tmp = *liftedPtr; \
+    tmp.func();                                                                       \
+    *liftedPtr = tmp;                                                                 \
+  } while (0)
 
 /**
  * @brief maps a function over its arguments up to work on references
  */
-#define fmap(ref, func, ...)                                              \
-  __extension__({                                                         \
-    typename std::pointer_traits<decltype(&ref)>::element_type tmp = ref; \
-    auto ret = tmp.func(__VA_ARGS__);                                     \
-    ref = tmp;                                                            \
-    ret;                                                                  \
+#define fmap(ref, func, ...)                                                          \
+  __extension__({                                                                     \
+    /* Evaluate `ref` once; operate on a local copy and write it back. */             \
+    auto liftedPtr = &(ref);                                                          \
+    typename std::pointer_traits<decltype(liftedPtr)>::element_type tmp = *liftedPtr; \
+    auto ret = tmp.func(__VA_ARGS__);                                                 \
+    *liftedPtr = tmp;                                                                 \
+    ret;                                                                              \
+  })
 
 /**
  * @brief maps a function over it's arguments to work on references and return void
  */
-#define fmapVoid(ref, func, ...)                                          \
-  do {                                                                    \
-    typename std::pointer_traits<decltype(&ref)>::element_type tmp = ref; \
-    tmp.func(__VA_ARGS__);                                                \
-    ref = tmp;                                                            \
+#define fmapVoid(ref, func, ...)                                                      \
+  do {                                                                                \
+    /* Evaluate `ref` once; operate on a local copy and write it back. */             \
+    auto liftedPtr = &(ref);                                                          \
+    typename std::pointer_traits<decltype(liftedPtr)>::element_type tmp = *liftedPtr; \
+    tmp.func(__VA_ARGS__);                                                            \
+    *liftedPtr = tmp;                                                                 \
+  } while (0)
 
 #endif // PANDO_LIB_GALOIS_UTILITY_GPTR_MONAD_HPP_
diff --git a/include/pando-lib-galois/utility/prefix_sum.hpp b/include/pando-lib-galois/utility/prefix_sum.hpp
index bedd1136..157e9662 100644
--- a/include/pando-lib-galois/utility/prefix_sum.hpp
+++ b/include/pando-lib-galois/utility/prefix_sum.hpp
@@ -99,8 +99,7 @@ class PrefixSum {
   A src;
   B dst;
 
-public:
-  uint64_t numThreads;
+private:
   using PArr = Conduit<B_Val>;
   Conduit<B_Val> paste;
   using WFLType = galois::WaterFallLock<Conduit<unsigned>>;
@@ -149,7 +148,7 @@ class PrefixSum {
     (void)ns;
     if (!wfl_id) {
       lock.template done<2>(wfl_id);
-      serial_pfxsum<B, B, B_Val, B_Val, equalizer, combiner, WFLType&, before<WFLType&>,
+      serial_pfxsum<PArr, PArr, B_Val, B_Val, equalizer, combiner, WFLType&, before<WFLType&>,
                     after<WFLType&>>(paste, paste, 0, 0, ns, lock);
     } else {
       lock.template wait<2>(wfl_id - 1);
@@ -157,7 +156,8 @@ class PrefixSum {
   }
 
   /** Does the final prefix sums with the last part of the array being handled
-   * by tid = 0 */
+   *  by tid = 0
+   */
   inline void parallel_pfxsum_phase_2(uint64_t src_offset, uint64_t dst_offset, uint64_t ns,
                                       B_Val phase1_val, bool pfxsum) {
     if (pfxsum) {
@@ -185,28 +185,19 @@ class PrefixSum {
 
 public:
   PrefixSum() = default;
-  PrefixSum(A src_, B dst_) : src(src_), dst(dst_), paste(B()), lock() {
-    uint64_t coreY = pando::getPlaceDims().core.y;
-    uint64_t cores = pando::getPlaceDims().core.x * coreY;
-    uint64_t threads = pando::getThreadDims().id;
-    uint64_t hosts = pando::getPlaceDims().node.id;
-    numThreads = hosts * cores * threads;
-  }
+  PrefixSum(A src_, B dst_) : src(src_), dst(dst_), paste(), lock() {}
 
-  [[nodiscard]] pando::Status initialize(std::uint64_t size) {
-    pando::Status err = lock.initialize(size);
+  [[nodiscard]] pando::Status initialize(std::uint64_t numWorkers) {
+    pando::Status err = lock.initialize(numWorkers);
     if (err != pando::Status::Success) {
       return err;
     }
-    err = paste.initialize(size);
+    err = paste.initialize(numWorkers);  // NOTE(review): lock stays initialized if this fails
     if (err != pando::Status::Success) {
       return err;
     }
     return pando::Status::Success;
   }
-  [[nodiscard]] pando::Status initialize() {
-    return initialize(numThreads);
-  }
 
   void deinitialize() {
     paste.deinitialize();
@@ -218,7 +209,7 @@ class PrefixSum {
    * @warning we expect ns to be less than equal to the length of source and destination
    */
   void computePrefixSum(uint64_t ns) {
-    uint64_t workers = numThreads;
+    std::uint64_t workers = paste.size();
     uint64_t workPerThread = ns / (workers + 1);
     if (workPerThread <= 10) {
       workers /= pando::getThreadDims().id;
diff --git a/pando-rt/src/atomic.cpp b/pando-rt/src/atomic.cpp
index 257e4c9f..22c9f27d 100644
--- a/pando-rt/src/atomic.cpp
+++ b/pando-rt/src/atomic.cpp
@@ -348,11 +348,11 @@ bool atomicCompareExchangeImpl(GlobalPtr<T> ptr, GlobalPtr<T> expected, GlobalPt
     if (handle.value() == expectedValue) {
       // success
       postAtomicOpFence(success);
-      *expected = expectedValue;
       return true;
     } else {
       // failure
       postAtomicOpFence(failure);
+      *expected = handle.value();
       return false;
     }
   }
diff --git a/pando-rt/test/containers/test_vector.cpp b/pando-rt/test/containers/test_vector.cpp
index fe556228..76fb1e9c 100644
--- a/pando-rt/test/containers/test_vector.cpp
+++ b/pando-rt/test/containers/test_vector.cpp
@@ -162,7 +162,7 @@ TEST(Vector, StressCreateDestroy) {
 
 TEST(Vector, StressPushBack) {
   const std::uint64_t size = 8;
-  const std::uint64_t finalSz = 1 << 10;
+  const std::uint64_t finalSz = 1 << 6;
 
   pando::Vector<std::uint64_t> vector;
   EXPECT_EQ(vector.initialize(size), pando::Status::Success);
diff --git a/scripts/mirror_master_validate.py b/scripts/mirror_master_validate.py
new file mode 100644
index 00000000..7324a6df
--- /dev/null
+++ b/scripts/mirror_master_validate.py
@@ -0,0 +1,27 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2023. University of Texas at Austin. All rights reserved.
+
+import sys
+
+def tablevalidation():
+    """Replay "OP, key, value" records from stdin; exit 1 on the first failed check."""
+    table = {}
+    for raw in sys.stdin:
+        fields = raw.strip().split(', ')
+        if len(fields) != 3:
+            continue  # not a record; ignore the line
+
+        operation, key, value = fields
+        if operation == "SET":
+            table[key] = value
+            continue
+
+        # TRUE expects the pair to be recorded; FALSE expects it not to be.
+        recorded = key in table and table[key] == value
+        if (operation == "TRUE" and not recorded) or (operation == "FALSE" and recorded):
+            sys.exit(1)
+    return "PASS"
+
+# Validate stdin only when run as a script, so importing this module has no side effects.
+if __name__ == "__main__":
+    print(tablevalidation())
diff --git a/src/ingest_rmat_el.cpp b/src/ingest_rmat_el.cpp
index be5f06f9..b5e7a989 100644
--- a/src/ingest_rmat_el.cpp
+++ b/src/ingest_rmat_el.cpp
@@ -20,15 +20,14 @@ auto generateRMATParser(
   };
 }
 
-void galois::loadELFilePerThread(galois::WaitGroup::HandleType wgh, pando::Array<char> filename,
-                                 std::uint64_t segmentsPerThread, std::uint64_t numThreads,
-                                 std::uint64_t threadID,
-                                 galois::PerThreadVector<pando::Vector<ELEdge>> localEdges,
-                                 DistArray<HashTable<std::uint64_t, std::uint64_t>> perThreadRename,
-                                 std::uint64_t numVertices) {
-  auto hartID = localEdges.getLocalVectorID();
+void galois::loadELFilePerThread(
+    galois::WaitGroup::HandleType wgh, pando::Array<char> filename, std::uint64_t segmentsPerThread,
+    std::uint64_t numThreads, std::uint64_t threadID,
+    galois::PerThreadVector<pando::Vector<ELEdge>> localEdges,
+    ThreadLocalStorage<HashTable<std::uint64_t, std::uint64_t>> perThreadRename,
+    std::uint64_t numVertices) {
   auto parser =
-      generateRMATParser(&localEdges.getThreadVector(), &perThreadRename[hartID], numVertices);
+      generateRMATParser(&localEdges.getThreadVector(), perThreadRename.getLocal(), numVertices);
   PANDO_CHECK(
       internal::loadGraphFilePerThread(filename, segmentsPerThread, numThreads, threadID, parser));
   wgh.done();
diff --git a/src/ingest_wmd_csv.cpp b/src/ingest_wmd_csv.cpp
index b561a198..524888fe 100644
--- a/src/ingest_wmd_csv.cpp
+++ b/src/ingest_wmd_csv.cpp
@@ -7,13 +7,13 @@ auto generateWMDParser(
     pando::Array<galois::StringView> tokens,
     pando::GlobalPtr<pando::Vector<pando::Vector<galois::WMDEdge>>> localEdges,
     pando::GlobalPtr<galois::HashTable<std::uint64_t, std::uint64_t>> localRename,
-    pando::GlobalPtr<pando::Vector<galois::WMDVertex>> localVertices, uint64_t* totVerts) {
+    pando::GlobalPtr<pando::Vector<galois::WMDVertex>> localReadVertices, uint64_t* totVerts) {
   using galois::WMDEdge, galois::WMDVertex;
   using galois::internal::insertLocalEdgesPerThread;
-  return [localEdges, localRename, localVertices, totVerts, tokens](char* line) {
-    auto vfunc = [localVertices, totVerts](WMDVertex v) {
+  return [localEdges, localRename, localReadVertices, totVerts, tokens](char* line) {
+    auto vfunc = [localReadVertices, totVerts](WMDVertex v) {
       *totVerts += 1;
-      return fmap(*localVertices, pushBack, v);
+      return fmap(*localReadVertices, pushBack, v);
     };
     auto efunc = [localEdges, localRename](WMDEdge e, agile::TYPES inverseEdgeType) {
       WMDEdge inverseE = e;
@@ -32,15 +32,14 @@ void galois::loadWMDFilePerThread(
     galois::WaitGroup::HandleType wgh, pando::Array<char> filename, std::uint64_t segmentsPerThread,
     std::uint64_t numThreads, std::uint64_t threadID,
     galois::PerThreadVector<pando::Vector<WMDEdge>> localEdges,
-    galois::DistArray<galois::HashTable<std::uint64_t, std::uint64_t>> perThreadRename,
-    galois::PerThreadVector<WMDVertex> localVertices,
+    galois::ThreadLocalStorage<galois::HashTable<std::uint64_t, std::uint64_t>> perThreadRename,
+    galois::ThreadLocalVector<WMDVertex> localReadVertices,
     galois::DAccumulator<std::uint64_t> totVerts) {
   std::uint64_t countLocalVertices = 0;
   pando::Array<galois::StringView> tokens;
   PANDO_CHECK(tokens.initialize(10));
-  auto hartID = localVertices.getLocalVectorID();
-  auto parser = generateWMDParser(tokens, &localEdges.getThreadVector(), &perThreadRename[hartID],
-                                  &localVertices.getThreadVector(), &countLocalVertices);
+  auto parser = generateWMDParser(tokens, &localEdges.getThreadVector(), perThreadRename.getLocal(),
+                                  localReadVertices.getLocal(), &countLocalVertices);
   PANDO_CHECK(
       internal::loadGraphFilePerThread(filename, segmentsPerThread, numThreads, threadID, parser));
 
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 41e5f080..9db3f07d 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -28,11 +28,16 @@ add_subdirectory(utility)
 pando_add_exec(import_csrlist import_csrlist.cpp)
 pando_add_exec(import_dirOptCsrList import_dirOptCsrList.cpp)
 pando_add_exec(import_ifstream import_ifstream.cpp)
+pando_add_exec(mirror_master_table test_mirror_master_table.cpp)
 
 pando_add_bin_test(import_dirOptCsrList "-n 10 -f" ${pando-lib-galois_SOURCE_DIR}/graphs/repeats.el
   ${pando-lib-galois_SOURCE_DIR}/ok/repeats.el-import-dirOptCsrList-10.ok)
+pando_add_bin_python_test(mirror_master_table "-n 10 -f" ${pando-lib-galois_SOURCE_DIR}/graphs/repeats.el)
+pando_add_bin_python_test(mirror_master_table "-n 10 -f" ${pando-lib-galois_SOURCE_DIR}/graphs/simple.el)
+
 
 if (NOT PANDO_RT_BACKEND STREQUAL "DRVX") # for speed reasons
+pando_add_bin_python_test(mirror_master_table  "-n 1024 -f" ${pando-lib-galois_SOURCE_DIR}/graphs/rmat_571919_seed1_scale10_nV1024_nE10447.el)
 
 pando_add_bin_test(import_dirOptCsrList "-n 1024 -f" ${pando-lib-galois_SOURCE_DIR}/graphs/rmat_571919_seed1_scale10_nV1024_nE10447.el
   ${pando-lib-galois_SOURCE_DIR}/ok/rmat_571919_seed1_scale10_nV1024_nE10447.el-import-dirOptCsrList-1024.ok)
diff --git a/test/containers/CMakeLists.txt b/test/containers/CMakeLists.txt
index bb6da64d..a2f6182d 100644
--- a/test/containers/CMakeLists.txt
+++ b/test/containers/CMakeLists.txt
@@ -8,3 +8,5 @@ pando_add_driver_test(test_stack test_stack.cpp)
 pando_add_driver_test(test_host_indexed_map test_host_indexed_map.cpp)
 pando_add_driver_test(test_host_local_storage test_host_local_storage.cpp)
 pando_add_driver_test(test_thread_local_storage test_thread_local_storage.cpp)
+pando_add_driver_test(test_thread_local_vector test_thread_local_vector.cpp)
+pando_add_driver_test(test_host_cached_array test_host_cached_array.cpp)
diff --git a/test/containers/test_host_cached_array.cpp b/test/containers/test_host_cached_array.cpp
new file mode 100644
index 00000000..1b90b357
--- /dev/null
+++ b/test/containers/test_host_cached_array.cpp
@@ -0,0 +1,277 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023. University of Texas at Austin. All rights reserved.
+
+#include <gtest/gtest.h>
+#include <pando-rt/export.h>
+
+#include <algorithm>
+
+#include <pando-lib-galois/containers/host_cached_array.hpp>
+#include <pando-lib-galois/utility/pair.hpp>
+#include <pando-rt/containers/vector.hpp>
+#include <pando-rt/pando-rt.hpp>
+
+TEST(HostCachedArray, Empty) {  // all-zero per-node sizes must yield an empty array
+  galois::HostCachedArray<std::uint64_t> array;
+  pando::Array<std::uint64_t> sizes;
+  EXPECT_EQ(sizes.initialize(pando::getPlaceDims().node.id), pando::Status::Success);
+  for (auto ref : sizes) {
+    ref = 0;  // presumably `ref` is a proxy reference, so this stores into `sizes`
+  }
+  EXPECT_EQ(array.initialize(sizes), pando::Status::Success);
+  EXPECT_EQ(array.size(), 0);
+  EXPECT_TRUE(array.empty());
+  array.deinitialize();
+  sizes.deinitialize();
+}
+
+TEST(HostCachedArray, ExecuteOn) {  // hand the array by value to a task placed on node 0
+  constexpr std::uint64_t goodVal = 0xDEADBEEF;
+
+  constexpr std::uint64_t size = 5;
+  const std::uint64_t nodes = pando::getPlaceDims().node.id;
+
+  pando::Array<std::uint64_t> sizes;
+  EXPECT_EQ(sizes.initialize(nodes), pando::Status::Success);
+  for (auto ref : sizes) {
+    ref = size;
+  }
+
+  // create array
+  galois::HostCachedArray<std::uint64_t> array;
+
+  EXPECT_EQ(array.initialize(sizes), pando::Status::Success);
+
+  for (std::uint64_t i = 0; i < size * nodes; i++) {
+    array[i] = goodVal;  // same constant the task compares against
+  }
+
+  pando::Status status;
+  auto func = +[](pando::NotificationHandle done, std::uint64_t goodVal,
+                  galois::HostCachedArray<std::uint64_t> hca) {
+    for (auto curr : hca) {
+      EXPECT_EQ(curr, goodVal);
+    }
+    done.notify();
+  };
+  pando::Notification notif;
+  EXPECT_EQ(notif.init(), pando::Status::Success);
+  status = pando::executeOn(pando::Place{pando::NodeIndex{0}, pando::anyPod, pando::anyCore}, func,
+                            notif.getHandle(), goodVal, array);
+  EXPECT_EQ(status, pando::Status::Success);
+  notif.wait();  // block until the task has signalled `done`
+
+  array.deinitialize();
+  sizes.deinitialize();
+}
+
+TEST(HostCachedArray, Initialize) {  // checks total size, element placement and round-trip
+  constexpr std::uint64_t size = 10;
+
+  const std::uint64_t nodes = pando::getPlaceDims().node.id;
+  pando::Array<std::uint64_t> sizes;
+  EXPECT_EQ(sizes.initialize(nodes), pando::Status::Success);
+  for (auto ref : sizes) {
+    ref = size;
+  }
+
+  galois::HostCachedArray<std::uint64_t> array;
+  EXPECT_EQ(array.initialize(sizes), pando::Status::Success);
+  EXPECT_EQ(array.size(), size * nodes);
+
+  for (std::uint64_t i = 0; i < size * nodes; i++) {
+    std::int16_t nodeIdx = i / size;  // expected owner: each run of `size` indices is one node
+    EXPECT_EQ(pando::localityOf(&array[i]).node.id, nodeIdx);
+    array[i] = i;
+  }
+
+  for (std::uint64_t i = 0; i < size * nodes; i++) {
+    EXPECT_EQ(array[i], i);
+  }
+
+  array.deinitialize();
+  sizes.deinitialize();
+}
+
+TEST(HostCachedArray, Swap) {  // std::swap must exchange the two arrays' contents
+  const std::uint64_t size0 = 10;
+  const std::uint64_t size1 = 16;
+  const std::uint64_t nodes = pando::getPlaceDims().node.id;
+  pando::Array<std::uint64_t> sizes0;
+  pando::Array<std::uint64_t> sizes1;
+
+  EXPECT_EQ(sizes0.initialize(nodes), pando::Status::Success);
+  for (auto ref : sizes0) {
+    ref = size0;
+  }
+
+  EXPECT_EQ(sizes1.initialize(nodes), pando::Status::Success);
+  for (auto ref : sizes1) {
+    ref = size1;
+  }
+
+  galois::HostCachedArray<std::uint64_t> array0;
+  EXPECT_EQ(array0.initialize(sizes0), pando::Status::Success);
+  for (std::uint64_t i = 0; i < size0 * nodes; i++) {
+    array0[i] = i;
+  }
+
+  galois::HostCachedArray<std::uint64_t> array1;
+  EXPECT_EQ(array1.initialize(sizes1), pando::Status::Success);
+  for (std::uint64_t i = 0; i < size1 * nodes; i++) {
+    array1[i] = i + (size0 * nodes);  // offset keeps array1's values disjoint from array0's
+  }
+
+  std::swap(array0, array1);
+
+  for (std::uint64_t i = 0; i < size1 * nodes; i++) {
+    EXPECT_EQ(array0[i], i + (size0 * nodes));  // array0 now holds what array1 was filled with
+  }
+
+  for (std::uint64_t i = 0; i < size0 * nodes; i++) {
+    EXPECT_EQ(array1[i], i);  // and vice versa
+  }
+
+  sizes0.deinitialize();
+  sizes1.deinitialize();
+  array0.deinitialize();
+  array1.deinitialize();
+}
+
+TEST(HostCachedArray, Iterator) {  // range-for must visit every element in index order
+  const std::uint64_t size = 25;
+
+  // create array
+  galois::HostCachedArray<std::uint64_t> array;
+
+  const std::uint64_t nodes = pando::getPlaceDims().node.id;
+  pando::Array<std::uint64_t> sizes;
+  EXPECT_EQ(sizes.initialize(nodes), pando::Status::Success);
+  for (auto ref : sizes) {
+    ref = size;
+  }
+
+  EXPECT_EQ(array.initialize(sizes), pando::Status::Success);
+
+  for (std::uint64_t i = 0; i < size * nodes; i++) {
+    array[i] = i;
+  }
+  for (std::uint64_t i = 0; i < size * nodes; i++) {  // verify all nodes, not only the first
+    EXPECT_EQ(array[i], i);
+  }
+
+  std::uint64_t i = 0;  // index the iterator is expected to be at
+  for (std::uint64_t val : array) {
+    EXPECT_EQ(val, i);
+    i++;
+  }
+
+  array.deinitialize();
+  sizes.deinitialize();
+}
+
+TEST(HostCachedArray, IteratorManual) {  // explicit begin()/end() walk with post-increment
+  const std::uint64_t size = 25;
+
+  // create array
+  galois::HostCachedArray<std::uint64_t> array;
+
+  const std::uint64_t nodes = pando::getPlaceDims().node.id;
+  pando::Array<std::uint64_t> sizes;
+  EXPECT_EQ(sizes.initialize(nodes), pando::Status::Success);
+  for (auto ref : sizes) {
+    ref = size;
+  }
+
+  EXPECT_EQ(array.initialize(sizes), pando::Status::Success);
+
+  for (std::uint64_t i = 0; i < size * nodes; i++) {
+    array[i] = i;
+  }
+
+  for (std::uint64_t i = 0; i < size * nodes; i++) {
+    EXPECT_EQ(array[i], i);  // sanity-check indexed access before testing the iterator
+  }
+
+  std::uint64_t i = 0;  // index the iterator is expected to be at
+  for (auto curr = array.begin(); curr != array.end(); curr++) {
+    EXPECT_EQ(*curr, i);
+    i++;
+  }
+
+  array.deinitialize();
+  sizes.deinitialize();
+}
+
+TEST(HostCachedArray, ReverseIterator) {  // rbegin()/rend() must walk from last to first
+  const std::uint64_t size = 25;
+
+  // create array
+  galois::HostCachedArray<std::uint64_t> array;
+
+  const std::uint64_t nodes = pando::getPlaceDims().node.id;
+  pando::Array<std::uint64_t> sizes;
+  EXPECT_EQ(sizes.initialize(nodes), pando::Status::Success);
+  for (auto ref : sizes) {
+    ref = size;
+  }
+
+  EXPECT_EQ(array.initialize(sizes), pando::Status::Success);
+
+  for (std::uint64_t i = 0; i < size * nodes; i++) {
+    array[i] = i;
+  }
+
+  for (std::uint64_t i = 0; i < size * nodes; i++) {
+    EXPECT_EQ(array[i], i);  // sanity-check indexed access before testing the iterator
+  }
+
+  std::uint64_t i = array.size() - 1;  // expected value counts down from the last index
+  for (auto curr = array.rbegin(); curr != array.rend(); curr++) {
+    EXPECT_EQ(*curr, i);
+    i--;  // wraps below zero after the final element; the wrapped value is never read
+  }
+
+  array.deinitialize();
+  sizes.deinitialize();
+}
+
+TEST(HostCachedArray, IteratorExecuteOn) {  // ship a begin/end iterator pair to a task
+  using It = galois::HostCachedArrayIterator<std::uint64_t>;
+  constexpr std::uint64_t goodVal = 0xDEADBEEF;
+
+  constexpr std::uint64_t size = 5;
+  const std::uint64_t nodes = pando::getPlaceDims().node.id;
+
+  pando::Array<std::uint64_t> sizes;
+  EXPECT_EQ(sizes.initialize(nodes), pando::Status::Success);
+  for (auto ref : sizes) {
+    ref = size;
+  }
+
+  // create array
+  galois::HostCachedArray<std::uint64_t> array;
+
+  EXPECT_EQ(array.initialize(sizes), pando::Status::Success);
+
+  for (std::uint64_t i = 0; i < size * nodes; i++) {
+    array[i] = goodVal;  // same constant the task compares against
+  }
+
+  pando::Status status;
+  auto func = +[](pando::NotificationHandle done, std::uint64_t goodVal, It begin, It end) {
+    for (auto curr = begin; curr != end; curr++) {
+      EXPECT_EQ(*curr, goodVal);
+    }
+    done.notify();
+  };
+  pando::Notification notif;
+  EXPECT_EQ(notif.init(), pando::Status::Success);
+  status = pando::executeOn(pando::Place{pando::NodeIndex{0}, pando::anyPod, pando::anyCore}, func,
+                            notif.getHandle(), goodVal, array.begin(), array.end());
+  EXPECT_EQ(status, pando::Status::Success);
+  notif.wait();  // block until the task has signalled `done`
+
+  array.deinitialize();
+  sizes.deinitialize();
+}
diff --git a/test/containers/test_per_thread.cpp b/test/containers/test_per_thread.cpp
index bf34bac3..c25e31b1 100644
--- a/test/containers/test_per_thread.cpp
+++ b/test/containers/test_per_thread.cpp
@@ -134,7 +134,7 @@ TEST(PerThreadVector, DoAll) {
   EXPECT_EQ(perThreadVec.initialize(), pando::Status::Success);
   *perThreadVecPtr = perThreadVec;
 
-  static const uint64_t workItems = 1000;
+  static const uint64_t workItems = 100;
   galois::DistArray<uint64_t> work;
   EXPECT_EQ(work.initialize(workItems), pando::Status::Success);
   for (uint64_t i = 0; i < workItems; i++) {
@@ -304,8 +304,8 @@ TEST(PerThreadVector, Clear) {
             });
       });
 
-  galois::DAccumulator<std::uint64_t> accum;
-  err = lift(accum, initialize);
+  galois::DAccumulator<std::uint64_t> accum{};
+  err = accum.initialize();
   EXPECT_EQ(err, pando::Status::Success);
 
   err = galois::doAll(
@@ -329,13 +329,10 @@ TEST(PerThreadVector, Clear) {
 }
 
 TEST(PerThreadVector, ClearCompute) {
-  pando::GlobalPtr<galois::PerThreadVector<uint64_t>> perThreadVecPtr =
-      getGlobalObject<galois::PerThreadVector<uint64_t>>();
   galois::PerThreadVector<uint64_t> perThreadVec;
   EXPECT_EQ(perThreadVec.initialize(), pando::Status::Success);
-  *perThreadVecPtr = perThreadVec;
 
-  static uint64_t workItems = 1000;
+  static uint64_t workItems = 100;
   galois::DistArray<uint64_t> work;
   EXPECT_EQ(work.initialize(workItems), pando::Status::Success);
   for (uint64_t i = 0; i < workItems; i++) {
@@ -463,15 +460,15 @@ TEST(PerThreadVector, ClearCompute) {
 TEST(Vector, IntVectorOfVectorsUniform) {
   pando::Vector<pando::Vector<std::uint64_t>> vec;
   EXPECT_EQ(vec.initialize(0), pando::Status::Success);
-  uint64_t size = 2000;
-  galois::HashTable<uint64_t, uint64_t> table;
+  uint64_t size = 10;
+  galois::HashTable<uint64_t, uint64_t> table{};
   PANDO_CHECK(table.initialize(8));
   uint64_t result = 0;
 
   // Creates a vector of vectors of size [i,1]
   for (uint64_t i = 0; i < size; i++) {
-    EXPECT_FALSE(fmap(table, get, i, result));
-    PANDO_CHECK(fmap(table, put, i, lift(vec, size)));
+    EXPECT_FALSE(table.get(i, result));
+    PANDO_CHECK(table.put(i, lift(vec, size)));
     pando::Vector<std::uint64_t> v;
     EXPECT_EQ(v.initialize(1), pando::Status::Success);
     v[0] = i;
@@ -480,8 +477,8 @@ TEST(Vector, IntVectorOfVectorsUniform) {
 
   // Pushes back i+i to each vector
   for (uint64_t i = 0; i < size; i++) {
-    EXPECT_TRUE(fmap(table, get, i, result));
-    pando::GlobalRef<pando::Vector<uint64_t>> vec1 = fmap(vec, get, result);
+    EXPECT_TRUE(table.get(i, result));
+    pando::GlobalRef<pando::Vector<uint64_t>> vec1 = vec.get(result);
     pando::Vector<uint64_t> vec2 = vec1;
     EXPECT_EQ(vec2.get(0), i);
     EXPECT_EQ(fmap(vec1, pushBack, (i + i)), pando::Status::Success);
@@ -494,8 +491,9 @@ TEST(Vector, IntVectorOfVectorsUniform) {
     EXPECT_EQ(vec2[1], i + i);
     EXPECT_EQ(vec2[0], i);
     EXPECT_EQ(vec2.size(), 2);
-    EXPECT_TRUE(fmap(table, get, i, result));
+    EXPECT_TRUE(table.get(i, result));
     EXPECT_EQ(result, i);
+    vec2.deinitialize();
   }
   EXPECT_EQ(vec.size(), size);
   vec.deinitialize();
@@ -504,8 +502,8 @@ TEST(Vector, IntVectorOfVectorsUniform) {
 TEST(Vector, IntVectorOfVectorsRandom) {
   pando::Vector<pando::Vector<std::uint64_t>> vec;
   EXPECT_EQ(vec.initialize(0), pando::Status::Success);
-  uint64_t size = 2000;
-  galois::HashTable<uint64_t, uint64_t> table;
+  uint64_t size = 10;
+  galois::HashTable<uint64_t, uint64_t> table{};
   PANDO_CHECK(table.initialize(8));
   uint64_t result = 0;
   std::random_device rd;
@@ -524,12 +522,12 @@ TEST(Vector, IntVectorOfVectorsRandom) {
     } else {
       map[src].push_back(dst);
     }
-    if (fmap(table, get, src, result)) {
-      pando::GlobalRef<pando::Vector<uint64_t>> vec1 = fmap(vec, get, result);
+    if (table.get(src, result)) {
+      pando::GlobalRef<pando::Vector<uint64_t>> vec1 = vec.get(result);
       pando::Vector<uint64_t> vec2 = vec1;
       EXPECT_EQ(fmap(vec1, pushBack, dst), pando::Status::Success);
     } else {
-      PANDO_CHECK(fmap(table, put, src, lift(vec, size)));
+      PANDO_CHECK(table.put(src, lift(vec, size)));
       pando::Vector<std::uint64_t> v;
       EXPECT_EQ(v.initialize(1), pando::Status::Success);
       v[0] = dst;
@@ -539,8 +537,8 @@ TEST(Vector, IntVectorOfVectorsRandom) {
 
   // Validates the vectors
   for (auto it = map.begin(); it != map.end(); ++it) {
-    EXPECT_TRUE(fmap(table, get, it->first, result));
-    pando::GlobalRef<pando::Vector<uint64_t>> vec1 = fmap(vec, get, result);
+    EXPECT_TRUE(table.get(it->first, result));
+    pando::GlobalRef<pando::Vector<uint64_t>> vec1 = vec.get(result);
     pando::Vector<uint64_t> vec2 = vec1;
     std::sort(vec2.begin(), vec2.end());
     std::vector<uint64_t> v = it->second;
@@ -557,8 +555,8 @@ TEST(Vector, IntVectorOfVectorsRandom) {
 TEST(Vector, EdgelistVectorOfVectors) {
   pando::Vector<pando::Vector<galois::WMDEdge>> vec;
   EXPECT_EQ(vec.initialize(0), pando::Status::Success);
-  uint64_t size = 2000;
-  galois::HashTable<uint64_t, uint64_t> table;
+  uint64_t size = 10;
+  galois::HashTable<uint64_t, uint64_t> table{};
   PANDO_CHECK(table.initialize(8));
   uint64_t result = 0;
   std::random_device rd;
@@ -578,13 +576,13 @@ TEST(Vector, EdgelistVectorOfVectors) {
       map[src].push_back(dst);
     }
 
-    if (fmap(table, get, src, result)) {
-      pando::GlobalRef<pando::Vector<galois::WMDEdge>> vec1 = fmap(vec, get, result);
+    if (table.get(src, result)) {
+      pando::GlobalRef<pando::Vector<galois::WMDEdge>> vec1 = vec.get(result);
       pando::Vector<galois::WMDEdge> vec2 = vec1;
       galois::WMDEdge edge(src, dst, agile::TYPES::NONE, agile::TYPES::NONE, agile::TYPES::NONE);
       EXPECT_EQ(fmap(vec1, pushBack, edge), pando::Status::Success);
     } else {
-      PANDO_CHECK(fmap(table, put, src, lift(vec, size)));
+      PANDO_CHECK(table.put(src, lift(vec, size)));
       pando::Vector<galois::WMDEdge> v;
       EXPECT_EQ(v.initialize(1), pando::Status::Success);
       galois::WMDEdge edge(src, dst, agile::TYPES::NONE, agile::TYPES::NONE, agile::TYPES::NONE);
@@ -595,11 +593,11 @@ TEST(Vector, EdgelistVectorOfVectors) {
 
   // Validates the vectors
   for (auto it = map.begin(); it != map.end(); ++it) {
-    EXPECT_TRUE(fmap(table, get, it->first, result));
-    pando::GlobalRef<pando::Vector<galois::WMDEdge>> vec1 = fmap(vec, get, result);
+    EXPECT_TRUE(table.get(it->first, result));
+    pando::GlobalRef<pando::Vector<galois::WMDEdge>> vec1 = vec.get(result);
     pando::Vector<galois::WMDEdge> vec2 = vec1;
     std::vector<uint64_t> v = it->second;
-    EXPECT_EQ(lift(vec2, size), v.size());
+    EXPECT_EQ(vec2.size(), v.size());
     for (uint64_t k = 0; k < lift(vec2, size); k++) {
       galois::WMDEdge edge = vec2[k];
       bool found = false;
diff --git a/test/containers/test_thread_local_storage.cpp b/test/containers/test_thread_local_storage.cpp
index caae41d4..ef0c5408 100644
--- a/test/containers/test_thread_local_storage.cpp
+++ b/test/containers/test_thread_local_storage.cpp
@@ -108,9 +108,9 @@ TEST(ThreadLocalStorage, DoAll) {
 }
 
 TEST(ThreadLocalStorage, copyToAllThreads) {
-  const std::uint64_t SIZE = 100;
+  const std::uint64_t SIZE = 10;
   pando::Array<std::uint64_t> arr;
-  EXPECT_EQ(pando::Status::Success, arr.initialize(100));
+  EXPECT_EQ(pando::Status::Success, arr.initialize(SIZE));
   for (std::uint64_t i = 0; i < SIZE; i++) {
     arr[i] = i;
   }
diff --git a/test/containers/test_thread_local_vector.cpp b/test/containers/test_thread_local_vector.cpp
new file mode 100644
index 00000000..cef9ece5
--- /dev/null
+++ b/test/containers/test_thread_local_vector.cpp
@@ -0,0 +1,402 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023. University of Texas at Austin. All rights reserved.
+
+#include <gtest/gtest.h>
+#include <random>
+
+#include "pando-rt/export.h"
+#include <pando-lib-galois/containers/hashtable.hpp>
+#include <pando-lib-galois/containers/thread_local_vector.hpp>
+#include <pando-lib-galois/graphs/wmd_graph.hpp>
+#include <pando-lib-galois/loops/do_all.hpp>
+#include <pando-lib-galois/utility/dist_accumulator.hpp>
+#include <pando-rt/containers/vector.hpp>
+#include <pando-rt/memory/global_ptr.hpp>
+#include <pando-rt/memory/memory_guard.hpp>
+#include <pando-rt/pando-rt.hpp>
+#include <pando-rt/sync/notification.hpp>
+
+namespace {
+
+template <typename T>  // allocates room for one T in main memory at the current place
+pando::GlobalPtr<T> getGlobalObject() {
+  const auto expected =
+      pando::allocateMemory<T>(1, pando::getCurrentPlace(), pando::MemoryType::Main);
+  EXPECT_EQ(expected.hasValue(), true);
+  return expected.value();  // NOTE(review): no test in this file calls this helper
+}
+
+/**
+uint64_t getHostThreads() {
+  uint64_t x = pando::getPlaceDims().core.x;
+  uint64_t y = pando::getPlaceDims().core.y;
+  uint64_t threads = pando::getThreadDims().id;
+  return x * y * threads;
+}
+*/
+
+struct State {  // bundle passed to the outer doAll: wait-group handle plus accumulator
+  State() = default;
+  State(galois::WaitGroup::HandleType f, galois::DAccumulator<uint64_t> s) : first(f), second(s) {}
+
+  galois::WaitGroup::HandleType first;    // forwarded to the nested doAll as its wait handle
+  galois::DAccumulator<uint64_t> second;  // forwarded to the nested doAll as its state
+};
+
+} // namespace
+
+TEST(ThreadLocalVector, Init) {  // one work item => exactly one element across all threads
+  galois::ThreadLocalVector<uint64_t> perThreadVec{};
+  EXPECT_EQ(perThreadVec.initialize(), pando::Status::Success);
+  pando::Vector<uint64_t> work;
+  EXPECT_EQ(work.initialize(1), pando::Status::Success);
+  work[0] = 9801;
+  galois::doAll(
+      perThreadVec, work, +[](galois::ThreadLocalVector<uint64_t> perThreadVec, uint64_t x) {
+        EXPECT_GE(pando::getCurrentThread().id, 0);
+        EXPECT_EQ(perThreadVec.pushBack(x), pando::Status::Success);
+        pando::Vector<uint64_t> localVec = perThreadVec.getLocalRef();
+        EXPECT_EQ(localVec.size(), 1);
+      });
+  EXPECT_EQ(perThreadVec.sizeAll(), 1);
+
+  std::uint64_t elts = 0;  // element count summed over every per-thread vector
+  for (pando::Vector<uint64_t> vec : perThreadVec) {
+    elts += vec.size();
+  }
+  EXPECT_EQ(elts, 1);
+
+  auto hca = PANDO_EXPECT_CHECK(perThreadVec.hostCachedFlatten());
+  EXPECT_EQ(hca.size(), 1);
+  uint64_t val = hca[0];
+  EXPECT_EQ(val, 9801);  // the flattened array must carry the single pushed value
+
+  hca.deinitialize();
+  work.deinitialize();
+  perThreadVec.deinitialize();
+}
+
+TEST(ThreadLocalVector, Parallel) {  // concurrent pushBack: sizes must add up to workItems
+  galois::ThreadLocalVector<uint64_t> perThreadVec{};
+  EXPECT_EQ(perThreadVec.initialize(), pando::Status::Success);
+  static const uint64_t workItems = 1000;
+  pando::Vector<uint64_t> work;
+  EXPECT_EQ(work.initialize(workItems), pando::Status::Success);
+  for (uint64_t i = 0; i < workItems; i++) {
+    work[i] = i;  // give every item a defined value; the range check below reads them back
+  }
+  galois::doAll(
+      perThreadVec, work, +[](galois::ThreadLocalVector<uint64_t>& perThreadVec, uint64_t x) {
+        uint64_t originalID = pando::getCurrentThread().id;
+        EXPECT_GE(originalID, 0);
+        EXPECT_LT(originalID, pando::getThreadDims().id);
+        pando::Vector<uint64_t> staleVec = perThreadVec.getLocalRef();  // snapshot before push
+        EXPECT_EQ(perThreadVec.pushBack(x), pando::Status::Success);
+        pando::Vector<uint64_t> localVec = perThreadVec.getLocalRef();  // snapshot after push
+        EXPECT_GT(localVec.size(), 0);
+        EXPECT_LT(localVec.size(), workItems);
+        EXPECT_EQ(localVec.size(), staleVec.size() + 1);
+      });
+  EXPECT_EQ(perThreadVec.sizeAll(), workItems);
+
+  uint64_t elts = 0;  // element count summed over every per-thread vector
+  for (uint64_t i = 0; i < perThreadVec.size(); i++) {
+    pando::Vector<uint64_t> vec = perThreadVec[i];
+    elts += vec.size();
+    for (uint64_t j = 0; j < vec.size(); j++) {  // distinct index: no shadowing of `i`
+      EXPECT_LT(vec[j], workItems);
+    }
+  }
+  EXPECT_EQ(elts, workItems);
+  EXPECT_EQ(perThreadVec.sizeAll(), workItems);
+
+  galois::HostCachedArray<uint64_t> hca = PANDO_EXPECT_CHECK(perThreadVec.hostCachedFlatten());
+  EXPECT_EQ(hca.size(), workItems);
+
+  hca.deinitialize();
+  work.deinitialize();
+  perThreadVec.deinitialize();
+}
+
+TEST(ThreadLocalVector, DoAll) {  // fill via doAll, then sum via nested doAll and via flatten
+  galois::ThreadLocalVector<uint64_t> perThreadVec;
+  EXPECT_EQ(perThreadVec.initialize(), pando::Status::Success);
+
+  static const uint64_t workItems = 100;
+  galois::DistArray<uint64_t> work;
+  EXPECT_EQ(work.initialize(workItems), pando::Status::Success);
+  for (uint64_t i = 0; i < workItems; i++) {
+    work[i] = i;
+  }
+
+  galois::DAccumulator<uint64_t> sum{};
+  EXPECT_EQ(sum.initialize(), pando::Status::Success);
+  EXPECT_EQ(sum.get(), 0);
+
+  galois::doAll(
+      perThreadVec, work, +[](galois::ThreadLocalVector<uint64_t>& perThreadVec, uint64_t x) {
+        uint64_t originalID = pando::getCurrentThread().id;
+        EXPECT_GE(originalID, 0);
+        EXPECT_LT(originalID, pando::getThreadDims().id);
+        pando::Vector<uint64_t> staleVec = perThreadVec.getLocalRef();  // snapshot before push
+
+        EXPECT_EQ(perThreadVec.pushBack(x), pando::Status::Success);
+
+        pando::Vector<uint64_t> localVec = perThreadVec.getLocalRef();  // snapshot after push
+        EXPECT_EQ(pando::localityOf(localVec.data()).node.id, pando::getCurrentPlace().node.id);
+        EXPECT_GT(localVec.size(), 0);
+        EXPECT_LT(localVec.size(), workItems);
+        EXPECT_EQ(localVec.size(), staleVec.size() + 1);
+      });
+  EXPECT_EQ(perThreadVec.sizeAll(), workItems);
+  const std::uint64_t size = perThreadVec.sizeAll();
+
+  EXPECT_EQ(perThreadVec.computeIndices(), pando::Status::Success);
+  EXPECT_EQ(size, perThreadVec.sizeAll());  // computeIndices must not change the element count
+
+  galois::WaitGroup wg;
+  EXPECT_EQ(wg.initialize(0), pando::Status::Success);
+  galois::doAll(
+      wg.getHandle(), State(wg.getHandle(), sum), perThreadVec,
+      +[](State state, pando::GlobalRef<pando::Vector<uint64_t>> vec) {
+        pando::Vector<uint64_t> v = vec;
+        for (uint64_t i = 0; i < v.size(); i++) {
+          EXPECT_LT(v[i], workItems);
+        }
+        galois::doAll(
+            state.first, state.second, v, +[](galois::DAccumulator<uint64_t> sum, uint64_t ref) {
+              EXPECT_LT(ref, workItems);
+              sum.add(ref);
+            });
+      });
+  EXPECT_EQ(wg.wait(), pando::Status::Success);
+  EXPECT_EQ(sum.reduce(), ((workItems - 1) + 0) * (workItems / 2));  // 0 + 1 + ... + 99
+
+  galois::HostCachedArray<uint64_t> hca = PANDO_EXPECT_CHECK(perThreadVec.hostCachedFlatten());
+  EXPECT_EQ(hca.size(), workItems);
+  uint64_t copy_sum = 0;  // same total, recomputed from the flattened copy
+  for (uint64_t elt : hca) {
+    copy_sum += elt;
+  }
+  EXPECT_EQ(copy_sum, ((workItems - 1) + 0) * (workItems / 2));
+
+  hca.deinitialize();
+  sum.deinitialize();
+  work.deinitialize();
+  wg.deinitialize();
+  perThreadVec.deinitialize();
+}
+
+TEST(ThreadLocalVector, HostLocalStorageVector) {  // hostFlattenAppend into per-host vectors
+  constexpr std::uint64_t size = 32;
+
+  galois::ThreadLocalVector<std::uint64_t> ptv;
+  EXPECT_EQ(ptv.initialize(), pando::Status::Success);
+
+  galois::HostLocalStorage<std::uint64_t> phu{};
+
+  galois::doAll(
+      ptv, phu, +[](galois::ThreadLocalVector<std::uint64_t> ptv, std::uint64_t) {
+        galois::doAll(
+            ptv, galois::IotaRange(0, size),
+            +[](galois::ThreadLocalVector<std::uint64_t> ptv, std::uint64_t i) {
+              pando::Status err;
+              err = ptv.pushBack(i);
+              EXPECT_EQ(err, pando::Status::Success);
+            });
+      });
+
+  galois::HostLocalStorage<pando::Vector<std::uint64_t>> phv;
+  PANDO_CHECK(phv.initialize());
+  for (auto vecRef : phv) {
+    EXPECT_EQ(fmap(vecRef, initialize, 0), pando::Status::Success);
+  }
+
+  EXPECT_EQ(ptv.hostFlattenAppend(phv), pando::Status::Success);
+
+  for (pando::GlobalRef<pando::Vector<std::uint64_t>> vecRef : phv) {
+    EXPECT_EQ(lift(vecRef, size), size);
+    std::sort(lift(vecRef, begin), lift(vecRef, end));  // push order is not deterministic
+    pando::Vector<std::uint64_t> vec = vecRef;
+    for (std::uint64_t i = 0; i < size; i++) {
+      EXPECT_EQ(vec[i], i);
+    }
+    vec.deinitialize();  // release this host's flattened buffer once it has been checked
+  }
+  phv.deinitialize();  // teardown mirrors the sibling tests, which free everything they create
+  ptv.deinitialize();
+}
+
+TEST(ThreadLocalVector, Clear) {  // clear() must leave every per-thread vector empty
+  constexpr std::uint64_t size = 32;
+  pando::Status err;
+
+  galois::ThreadLocalVector<std::uint64_t> ptv;
+  err = ptv.initialize();
+  EXPECT_EQ(err, pando::Status::Success);
+
+  galois::HostLocalStorage<std::uint64_t> phu{};
+
+  galois::doAll(
+      ptv, phu, +[](galois::ThreadLocalVector<std::uint64_t> ptv, std::uint64_t) {
+        galois::doAll(
+            ptv, galois::IotaRange(0, size),
+            +[](galois::ThreadLocalVector<std::uint64_t> ptv, std::uint64_t i) {
+              pando::Status err;
+              err = ptv.pushBack(i);
+              EXPECT_EQ(err, pando::Status::Success);
+            });
+      });
+
+  galois::DAccumulator<std::uint64_t> accum{};
+  EXPECT_EQ(accum.initialize(), pando::Status::Success);
+
+  err = galois::doAll(
+      accum, ptv,
+      +[](galois::DAccumulator<std::uint64_t> accum,
+          pando::GlobalRef<pando::Vector<std::uint64_t>> refVec) {
+        accum.add(lift(refVec, size));  // total number of stored elements
+      });
+  EXPECT_EQ(err, pando::Status::Success);
+  EXPECT_EQ(accum.reduce(), size * static_cast<std::uint64_t>(pando::getPlaceDims().node.id));
+
+  ptv.clear();
+
+  galois::doAll(
+      ptv, +[](pando::GlobalRef<pando::Vector<std::uint64_t>> refVec) {
+        EXPECT_EQ(0, lift(refVec, size));
+      });
+
+  accum.deinitialize();
+  ptv.deinitialize();
+}
+
+TEST(ThreadLocalVector, ClearCompute) {  // runs the DoAll scenario, clear()s, then runs it again
+  galois::ThreadLocalVector<uint64_t> perThreadVec;
+  EXPECT_EQ(perThreadVec.initialize(), pando::Status::Success);
+
+  static uint64_t workItems = 100;
+  galois::DistArray<uint64_t> work;
+  EXPECT_EQ(work.initialize(workItems), pando::Status::Success);
+  for (uint64_t i = 0; i < workItems; i++) {
+    work[i] = i;
+  }
+
+  galois::DAccumulator<uint64_t> sum{};
+  EXPECT_EQ(sum.initialize(), pando::Status::Success);
+  EXPECT_EQ(sum.get(), 0);
+
+  galois::doAll(
+      perThreadVec, work, +[](galois::ThreadLocalVector<uint64_t>& perThreadVec, uint64_t x) {
+        uint64_t originalID = pando::getCurrentThread().id;
+        EXPECT_GE(originalID, 0);
+        EXPECT_LT(originalID, pando::getThreadDims().id);
+        pando::Vector<uint64_t> staleVec = perThreadVec.getLocalRef();  // snapshot before push
+
+        EXPECT_EQ(perThreadVec.pushBack(x), pando::Status::Success);
+
+        pando::Vector<uint64_t> localVec = perThreadVec.getLocalRef();  // snapshot after push
+        EXPECT_EQ(pando::localityOf(localVec.data()).node.id, pando::getCurrentPlace().node.id);
+        EXPECT_GT(localVec.size(), 0);
+        EXPECT_LT(localVec.size(), workItems);
+        EXPECT_EQ(localVec.size(), staleVec.size() + 1);
+      });
+  EXPECT_EQ(perThreadVec.sizeAll(), workItems);
+
+  const std::uint64_t sizeAll0 = perThreadVec.sizeAll();
+  EXPECT_EQ(perThreadVec.computeIndices(), pando::Status::Success);
+  EXPECT_EQ(sizeAll0, perThreadVec.sizeAll());  // computeIndices must not change the count
+
+  galois::WaitGroup wg;
+  EXPECT_EQ(wg.initialize(0), pando::Status::Success);
+  galois::doAll(
+      wg.getHandle(), State(wg.getHandle(), sum), perThreadVec,
+      +[](State state, pando::GlobalRef<pando::Vector<uint64_t>> vec) {
+        pando::Vector<uint64_t> v = vec;
+        for (uint64_t i = 0; i < v.size(); i++) {
+          EXPECT_LT(v[i], workItems);
+        }
+        galois::doAll(
+            state.first, state.second, v, +[](galois::DAccumulator<uint64_t> sum, uint64_t ref) {
+              EXPECT_LT(ref, workItems);
+              sum.add(ref);
+            });
+      });
+  EXPECT_EQ(wg.wait(), pando::Status::Success);
+  EXPECT_EQ(sum.reduce(), ((workItems - 1) + 0) * (workItems / 2));  // 0 + 1 + ... + 99
+
+  galois::HostCachedArray<uint64_t> hca = PANDO_EXPECT_CHECK(perThreadVec.hostCachedFlatten());
+  EXPECT_EQ(hca.size(), workItems);
+  uint64_t copy_sum = 0;  // same total, recomputed from the flattened copy
+  for (uint64_t elt : hca) {
+    copy_sum += elt;
+  }
+  EXPECT_EQ(copy_sum, ((workItems - 1) + 0) * (workItems / 2));
+
+  hca.deinitialize();
+  sum.deinitialize();
+  work.deinitialize();
+  wg.deinitialize();
+  perThreadVec.clear();  // keep the container itself; the second pass below reuses it
+
+  workItems = 100;  // re-assigns the value it already holds
+  EXPECT_EQ(work.initialize(workItems), pando::Status::Success);
+  for (uint64_t i = 0; i < workItems; i++) {
+    work[i] = i;
+  }
+
+  EXPECT_EQ(sum.initialize(), pando::Status::Success);
+  EXPECT_EQ(sum.get(), 0);
+
+  galois::doAll(
+      perThreadVec, work, +[](galois::ThreadLocalVector<uint64_t>& perThreadVec, uint64_t x) {
+        uint64_t originalID = pando::getCurrentThread().id;
+        EXPECT_GE(originalID, 0);
+        EXPECT_LT(originalID, pando::getThreadDims().id);
+        pando::Vector<uint64_t> staleVec = perThreadVec.getLocalRef();  // snapshot before push
+
+        EXPECT_EQ(perThreadVec.pushBack(x), pando::Status::Success);
+
+        pando::Vector<uint64_t> localVec = perThreadVec.getLocalRef();  // snapshot after push
+        EXPECT_EQ(pando::localityOf(localVec.data()).node.id, pando::getCurrentPlace().node.id);
+        EXPECT_GT(localVec.size(), 0);
+        EXPECT_LT(localVec.size(), workItems);
+        EXPECT_EQ(localVec.size(), staleVec.size() + 1);
+      });
+  EXPECT_EQ(perThreadVec.sizeAll(), workItems);  // only second-pass elements may be counted
+
+  const std::uint64_t sizeAll1 = perThreadVec.sizeAll();
+  EXPECT_EQ(perThreadVec.computeIndices(), pando::Status::Success);
+  EXPECT_EQ(sizeAll1, perThreadVec.sizeAll());
+
+  EXPECT_EQ(wg.initialize(0), pando::Status::Success);
+  galois::doAll(
+      wg.getHandle(), State(wg.getHandle(), sum), perThreadVec,
+      +[](State state, pando::GlobalRef<pando::Vector<uint64_t>> vec) {
+        pando::Vector<uint64_t> v = vec;
+        for (uint64_t i = 0; i < v.size(); i++) {
+          EXPECT_LT(v[i], workItems);
+        }
+        galois::doAll(
+            state.first, state.second, v, +[](galois::DAccumulator<uint64_t> sum, uint64_t ref) {
+              EXPECT_LT(ref, workItems);
+              sum.add(ref);
+            });
+      });
+  EXPECT_EQ(wg.wait(), pando::Status::Success);
+  EXPECT_EQ(sum.reduce(), ((workItems - 1) + 0) * (workItems / 2));
+
+  hca = PANDO_EXPECT_CHECK(perThreadVec.hostCachedFlatten());
+  EXPECT_EQ(hca.size(), workItems);
+  copy_sum = 0;
+  for (uint64_t elt : hca) {
+    copy_sum += elt;
+  }
+  EXPECT_EQ(copy_sum, ((workItems - 1) + 0) * (workItems / 2));
+
+  hca.deinitialize();
+  sum.deinitialize();
+  work.deinitialize();
+  wg.deinitialize();
+  perThreadVec.deinitialize();
+}
diff --git a/test/graphs/CMakeLists.txt b/test/graphs/CMakeLists.txt
index e127d446..e12a09fa 100644
--- a/test/graphs/CMakeLists.txt
+++ b/test/graphs/CMakeLists.txt
@@ -3,3 +3,5 @@
 
 pando_add_driver_test(test_dist_array_csr test_dist_array_csr.cpp)
 pando_add_driver_test(test_local_csr  test_local_csr.cpp)
+pando_add_driver_test(test_dist_local_csr test_dist_local_csr.cpp)
+pando_add_driver_test(test_mirror_dist_local_csr test_mirror_dist_local_csr.cpp)
diff --git a/test/graphs/test_mirror_dist_local_csr.cpp b/test/graphs/test_mirror_dist_local_csr.cpp
new file mode 100644
index 00000000..c50f25cb
--- /dev/null
+++ b/test/graphs/test_mirror_dist_local_csr.cpp
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023. University of Texas at Austin. All rights reserved.
+
+#include <gtest/gtest.h>
+
+#include <variant>
+
+#include "pando-rt/export.h"
+#include <pando-lib-galois/containers/dist_array.hpp>
+#include <pando-lib-galois/graphs/graph_traits.hpp>
+#include <pando-lib-galois/graphs/mirror_dist_local_csr.hpp>
+#include <pando-lib-galois/loops/do_all.hpp>
+#include <pando-lib-galois/sync/wait_group.hpp>
+#include <pando-rt/containers/vector.hpp>
+#include <pando-rt/memory/memory_guard.hpp>
+#include <pando-rt/pando-rt.hpp>
+#include <pando-rt/sync/notification.hpp>
+
+pando::Vector<pando::Vector<std::uint64_t>> generateFullyConnectedGraph(std::uint64_t SIZE) {
+  pando::Vector<pando::Vector<std::uint64_t>> vec;  // SIZE rows, each filled with 0..SIZE-1
+  EXPECT_EQ(vec.initialize(SIZE), pando::Status::Success);
+  for (pando::GlobalRef<pando::Vector<std::uint64_t>> edges : vec) {
+    pando::Vector<std::uint64_t> inner;
+    EXPECT_EQ(inner.initialize(0), pando::Status::Success);
+    edges = inner;  // every row starts out as an initialized, empty vector
+  }
+
+  galois::doAll(
+      SIZE, vec, +[](std::uint64_t size, pando::GlobalRef<pando::Vector<std::uint64_t>> innerRef) {
+        pando::Vector<std::uint64_t> inner = innerRef;  // work on a local copy of the handle
+        for (std::uint64_t i = 0; i < size; i++) {
+          EXPECT_EQ(inner.pushBack(i), pando::Status::Success);
+        }
+        innerRef = inner;  // store the updated handle back into the outer vector
+      });
+  return vec;  // caller is expected to release it (see deleteVectorVector in this file)
+}
+
+template <typename T>  // frees every inner vector in parallel, then the outer vector
+pando::Status deleteVectorVector(pando::Vector<pando::Vector<T>> vec) {
+  auto err = galois::doAll(
+      vec, +[](pando::GlobalRef<pando::Vector<T>> innerRef) {
+        pando::Vector<T> inner = innerRef;  // must match T, not a fixed element type
+        inner.deinitialize();
+        innerRef = inner;  // store the deinitialized handle back into the outer vector
+      });
+  vec.deinitialize();
+  return err;  // status reported by the parallel loop
+}
+
+using Graph = galois::MirrorDistLocalCSR<std::uint64_t, std::uint64_t>;
+
+TEST(MirrorDistLocalCSR, NumVertices) {  // currently only builds and frees the edge lists
+  constexpr std::uint64_t SIZE = 10;
+  Graph graph;  // NOTE(review): never initialized or queried; the test body looks unfinished
+  auto vec = generateFullyConnectedGraph(SIZE);
+
+  EXPECT_EQ(deleteVectorVector(vec), pando::Status::Success);
+}
diff --git a/test/import/test_cusp_importer.cpp b/test/import/test_cusp_importer.cpp
index d310bb89..94a2aa7f 100644
--- a/test/import/test_cusp_importer.cpp
+++ b/test/import/test_cusp_importer.cpp
@@ -6,6 +6,7 @@
 
 #include <numeric>
 
+#include <pando-lib-galois/containers/thread_local_storage.hpp>
 #include <pando-lib-galois/graphs/wmd_graph.hpp>
 #include <pando-lib-galois/import/ingest_rmat_el.hpp>
 #include <pando-lib-galois/import/ingest_wmd_csv.hpp>
@@ -671,7 +672,7 @@ TEST(loadGraphFilePerThread, loadGraph) {
   uint64_t segmentsPerThread = 1;
   galois::PerThreadVector<pando::Vector<galois::WMDEdge>> localEdges;
   EXPECT_EQ(localEdges.initialize(), pando::Status::Success);
-  galois::PerThreadVector<galois::WMDVertex> localVertices;
+  galois::ThreadLocalVector<galois::WMDVertex> localVertices;
   EXPECT_EQ(localVertices.initialize(), pando::Status::Success);
   pando::Array<char> filename;
   std::string wmdFile = "/pando/graphs/simple_wmd.csv";
@@ -679,25 +680,20 @@ TEST(loadGraphFilePerThread, loadGraph) {
   for (uint64_t i = 0; i < wmdFile.size(); i++)
     filename[i] = wmdFile[i];
 
-  galois::DistArray<galois::HashTable<std::uint64_t, std::uint64_t>> perThreadRename;
-  PANDO_CHECK(perThreadRename.initialize(localEdges.size()));
-  for (std::uint64_t i = 0; i < localEdges.size(); i++) {
+  galois::ThreadLocalStorage<galois::HashTable<std::uint64_t, std::uint64_t>> perThreadRename;
+  PANDO_CHECK(perThreadRename.initialize());
+  for (std::uint64_t i = 0; i < perThreadRename.size(); i++) {
     perThreadRename[i] = galois::HashTable<std::uint64_t, std::uint64_t>();
     pando::Status err = fmap(perThreadRename[i], initialize, 0);
     EXPECT_EQ(err, pando::Status::Success);
   }
 
-  for (std::uint64_t i = 0; i < numThreads; i++) {
-    perThreadRename[i] = galois::HashTable<std::uint64_t, std::uint64_t>();
-    pando::Status err = fmap(perThreadRename[i], initialize, 0);
-    EXPECT_EQ(err, pando::Status::Success);
-  }
-
-  galois::DAccumulator<std::uint64_t> totVerts;
+  galois::DAccumulator<std::uint64_t> totVerts{};
   EXPECT_EQ(totVerts.initialize(), pando::Status::Success);
 
   galois::WaitGroup wg;
   EXPECT_EQ(pando::Status::Success, wg.initialize(numThreads));
+
   auto wgh = wg.getHandle();
 
   for (uint64_t i = 0; i < numThreads; i++) {
@@ -713,13 +709,13 @@ TEST(loadGraphFilePerThread, loadGraph) {
 
   wg.deinitialize();
 
-  auto freeStuff =
-      +[](galois::DistArray<galois::HashTable<std::uint64_t, std::uint64_t>> perThreadRename) {
-        for (galois::HashTable<std::uint64_t, std::uint64_t> hash : perThreadRename) {
-          hash.deinitialize();
-        }
-        perThreadRename.deinitialize();
-      };
+  auto freeStuff = +[](galois::ThreadLocalStorage<galois::HashTable<std::uint64_t, std::uint64_t>>
+                           perThreadRename) {
+    for (galois::HashTable<std::uint64_t, std::uint64_t> hash : perThreadRename) {
+      hash.deinitialize();
+    }
+  };
+  perThreadRename.deinitialize();
   EXPECT_EQ(pando::Status::Success, pando::executeOn(pando::anyPlace, freeStuff, perThreadRename));
 
   uint64_t numVertices = 0;
@@ -727,7 +723,7 @@ TEST(loadGraphFilePerThread, loadGraph) {
   getNumVerticesAndEdges(wmdFile, numVertices, numEdges);
   uint64_t vert = 0;
   for (uint64_t i = 0; i < localVertices.size(); i++) {
-    pando::Vector<galois::WMDVertex> vec = *localVertices.get(i);
+    pando::Vector<galois::WMDVertex> vec = localVertices[i];
     vert += vec.size();
   }
   uint64_t edges = 0;
@@ -740,6 +736,7 @@ TEST(loadGraphFilePerThread, loadGraph) {
   EXPECT_EQ(vert, numVertices);
   EXPECT_EQ(edges, 2 * numEdges);
   totVerts.deinitialize();
+  localVertices.deinitialize();
 }
 
 TEST(loadGraphFilePerThread, loadEdgeList) {
@@ -756,9 +753,9 @@ TEST(loadGraphFilePerThread, loadEdgeList) {
 
   const std::uint64_t numThreads = localEdges.size() - pando::getPlaceDims().node.id;
 
-  galois::DistArray<galois::HashTable<std::uint64_t, std::uint64_t>> perThreadRename{};
-  PANDO_CHECK(perThreadRename.initialize(localEdges.size()));
-  for (std::uint64_t i = 0; i < localEdges.size(); i++) {
+  galois::ThreadLocalStorage<galois::HashTable<std::uint64_t, std::uint64_t>> perThreadRename{};
+  PANDO_CHECK(perThreadRename.initialize());
+  for (std::uint64_t i = 0; i < perThreadRename.size(); i++) {
     perThreadRename[i] = galois::HashTable<std::uint64_t, std::uint64_t>();
     pando::Status err = fmap(perThreadRename[i], initialize, 0);
     EXPECT_EQ(err, pando::Status::Success);
@@ -778,14 +775,14 @@ TEST(loadGraphFilePerThread, loadEdgeList) {
   }
   EXPECT_EQ(wg.wait(), pando::Status::Success);
 
-  auto freeStuff =
-      +[](galois::DistArray<galois::HashTable<std::uint64_t, std::uint64_t>> perThreadRename) {
-        for (galois::HashTable<std::uint64_t, std::uint64_t> hash : perThreadRename) {
-          hash.deinitialize();
-        }
-        perThreadRename.deinitialize();
-      };
+  auto freeStuff = +[](galois::ThreadLocalStorage<galois::HashTable<std::uint64_t, std::uint64_t>>
+                           perThreadRename) {
+    for (galois::HashTable<std::uint64_t, std::uint64_t> hash : perThreadRename) {
+      hash.deinitialize();
+    }
+  };
   EXPECT_EQ(pando::Status::Success, pando::executeOn(pando::anyPlace, freeStuff, perThreadRename));
+  perThreadRename.deinitialize();
 
   uint64_t numEdges = getNumEdges(edgelistFile);
   uint64_t edges = 0;
diff --git a/test/import/test_wmd_importer.cpp b/test/import/test_wmd_importer.cpp
index 72f00847..4f7085d7 100644
--- a/test/import/test_wmd_importer.cpp
+++ b/test/import/test_wmd_importer.cpp
@@ -5,6 +5,7 @@
 #include <pando-rt/export.h>
 
 #include <numeric>
+#include <pando-lib-galois/containers/hashtable.hpp>
 #include <pando-lib-galois/graphs/dist_local_csr.hpp>
 #include <pando-lib-galois/graphs/wmd_graph.hpp>
 #include <pando-lib-galois/import/ingest_rmat_el.hpp>
@@ -198,7 +199,8 @@ TEST_P(DLCSRInitEdgeList, initializeEL) {
   for (uint64_t i = 0; i < elFile.size(); i++)
     filename[i] = elFile[i];
 
-  Graph graph = galois::initializeELDLCSR<galois::ELVertex, galois::ELEdge>(filename, numVertices);
+  Graph graph =
+      galois::initializeELDLCSR<Graph, galois::ELVertex, galois::ELEdge>(filename, numVertices);
 
   // Validate
   std::unordered_map<std::uint64_t, std::vector<std::uint64_t>> goldenTable;
@@ -270,3 +272,107 @@ INSTANTIATE_TEST_SUITE_P(
         std::make_tuple("/pando/graphs/rmat_571919_seed1_scale16_nV65536_nE909846.el", 65536),
         std::make_tuple("/pando/graphs/rmat_571919_seed1_scale17_nV131072_nE1864704.el", 131072),
         std::make_tuple("/pando/graphs/rmat_571919_seed1_scale18_nV262144_nE3806162.el", 262144)));
+
+class MirrorDLCSRInitEdgeList  // test parameter: (edge-list path, vertex count)
+    : public ::testing::TestWithParam<std::tuple<const char*, std::uint64_t>> {};
+TEST_P(MirrorDLCSRInitEdgeList, initializeEL) {
+  using ET = galois::ELEdge;
+  using VT = galois::ELVertex;
+  using Graph = galois::MirrorDistLocalCSR<VT, ET>;
+  galois::HostLocalStorageHeap::HeapInit();
+
+  const std::string elFile = std::get<0>(GetParam());
+  const std::uint64_t numVertices = std::get<1>(GetParam());
+
+  pando::Array<char> filename;  // the path copied character by character, no terminator added
+  EXPECT_EQ(pando::Status::Success, filename.initialize(elFile.size()));
+  for (uint64_t i = 0; i < elFile.size(); i++)
+    filename[i] = elFile[i];
+
+  Graph graph =
+      galois::initializeELDLCSR<Graph, galois::ELVertex, galois::ELEdge>(filename, numVertices);
+
+  // Validate against goldenTable, which getVerticesAndEdgesEL fills from the same file.
+  std::unordered_map<std::uint64_t, std::vector<std::uint64_t>> goldenTable;
+  getVerticesAndEdgesEL(elFile, numVertices, goldenTable);
+  EXPECT_EQ(goldenTable.size(), graph.size());
+
+  // Iterate over vertices; vid is the index each visited vertex is expected to report.
+  std::uint64_t vid = 0;
+
+  for (typename Graph::VertexTopologyID vert : graph.vertices()) {
+    EXPECT_EQ(vid, graph.getVertexIndex(vert));
+    vid++;
+    typename Graph::VertexTokenID srcTok = graph.getTokenID(vert);
+
+    EXPECT_LT(srcTok, numVertices);
+
+    typename Graph::VertexData vertexData = graph.getData(vert);
+    EXPECT_EQ(srcTok, vertexData.id);
+
+    VT dumbVertex = VT{numVertices};  // sentinel id: valid tokens are < numVertices
+    graph.setData(vert, dumbVertex);
+    vertexData = graph.getData(vert);
+    EXPECT_EQ(vertexData.id, numVertices);
+
+    // The source token must have a golden-table entry before its edges are compared.
+    EXPECT_NE(goldenTable.find(srcTok), goldenTable.end())
+        << "Failed to find edges with tok_id:" << srcTok << "\t with index: " << (vid - 1);
+    std::vector<std::uint64_t> goldenEdges = goldenTable[srcTok];
+
+    for (typename Graph::EdgeHandle eh : graph.edges(vert)) {
+      typename Graph::EdgeData eData = graph.getEdgeData(eh);
+
+      EXPECT_EQ(eData.src, srcTok);
+
+      typename Graph::VertexTokenID dstTok = graph.getTokenID(graph.getEdgeDst(eh));
+      EXPECT_EQ(eData.dst, dstTok);
+
+      auto mirrorTopology = graph.getTopologyID(dstTok);
+      auto masterTopology = graph.getGlobalTopologyID(dstTok);
+      if (mirrorTopology != masterTopology) {
+        // The two lookups disagree, so the destination has a mirror here:
+        // the mirror must be local and its master must not be.
+        ASSERT_TRUE(graph.isLocal(mirrorTopology));
+        ASSERT_TRUE(!graph.isLocal(masterTopology));
+        // The mirror must lie inside the mirror range.
+        auto it = graph.getMirrorRange();
+        ASSERT_TRUE(*it.begin() <= mirrorTopology && mirrorTopology < *it.end());
+      } else {
+        // The two lookups agree, so there is no mirror: either the master is local or the
+        // vertex is never an edge destination from here.
+        if (graph.isLocal(masterTopology)) {
+          // A local master must lie inside the master range.
+          auto it = graph.getMasterRange();
+          ASSERT_TRUE(*it.begin() <= masterTopology && masterTopology < *it.end());
+          // Presumably it must also be absent from the mirror-to-master table (not checked).
+        }
+      }
+
+      auto goldenEdgeIt = std::find(goldenEdges.begin(), goldenEdges.end(), dstTok);
+      EXPECT_NE(goldenEdgeIt, goldenEdges.end())
+          << "Unable to find edge with src_tok: " << srcTok << "\tand dst_tok: " << dstTok
+          << "\tat vertex: " << (vid - 1);
+      ET dumbEdge = ET{numVertices, numVertices};  // sentinel endpoints for the read-back check
+      graph.setEdgeData(eh, dumbEdge);
+      eData = graph.getEdgeData(eh);
+      EXPECT_EQ(eData.src, numVertices);
+      EXPECT_EQ(eData.dst, numVertices);
+    }
+  }
+  graph.deinitialize();
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    SmallFiles, MirrorDLCSRInitEdgeList,  // each tuple is (edge-list path, vertex count)
+    ::testing::Values(std::make_tuple("/pando/graphs/simple.el", 10),
+                      std::make_tuple("/pando/graphs/rmat_571919_seed1_scale10_nV1024_nE10447.el",
+                                      1024)));
+
+INSTANTIATE_TEST_SUITE_P(
+    DISABLED_BigFiles, MirrorDLCSRInitEdgeList,  // DISABLED_ prefix: googletest skips by default
+    ::testing::Values(
+        std::make_tuple("/pando/graphs/rmat_571919_seed1_scale11_nV2048_nE22601.el", 2048),
+        std::make_tuple("/pando/graphs/rmat_571919_seed1_scale12_nV4096_nE48335.el", 4096),
+        std::make_tuple("/pando/graphs/rmat_571919_seed1_scale13_nV8192_nE102016.el", 8192),
+        std::make_tuple("/pando/graphs/rmat_571919_seed1_scale14_nV16384_nE213350.el", 16384)));
diff --git a/test/sync/test_simple_lock.cpp b/test/sync/test_simple_lock.cpp
index 3d180377..a65cb82b 100644
--- a/test/sync/test_simple_lock.cpp
+++ b/test/sync/test_simple_lock.cpp
@@ -6,9 +6,11 @@
 
 #include <cstdint>
 
+#include <pando-lib-galois/containers/host_local_storage.hpp>
 #include <pando-lib-galois/loops/do_all.hpp>
 #include <pando-lib-galois/sync/global_barrier.hpp>
 #include <pando-lib-galois/sync/simple_lock.hpp>
+#include <pando-lib-galois/utility/tuple.hpp>
 #include <pando-rt/containers/vector.hpp>
 #include <pando-rt/memory/global_ptr.hpp>
 #include <pando-rt/pando-rt.hpp>
@@ -37,7 +39,7 @@ TEST(SimpleLock, TryLock) {
 
 TEST(SimpleLock, SimpleLockUnlock) {
   auto test = [] {
-    galois::SimpleLock mutex;
+    galois::SimpleLock mutex{};
     EXPECT_EQ(mutex.initialize(), pando::Status::Success);
     mutex.lock();
     mutex.unlock();
@@ -52,45 +54,36 @@ TEST(SimpleLock, SimpleLockUnlock) {
 }
 
 TEST(SimpleLock, ActualLockUnlock) {
-  auto dims = pando::getPlaceDims();
-  galois::GlobalBarrier gb;
-  EXPECT_EQ(gb.initialize(dims.node.id), pando::Status::Success);
   galois::SimpleLock mutex;
   EXPECT_EQ(mutex.initialize(), pando::Status::Success);
   pando::Array<int> array;
   EXPECT_EQ(array.initialize(10), pando::Status::Success);
   array.fill(0);
 
-  auto func = +[](galois::GlobalBarrier gb, galois::SimpleLock mutex, pando::Array<int> array) {
-    mutex.lock();
-    for (int i = 0; i < 10; i++) {
-      if ((i + 1 + pando::getCurrentPlace().node.id) <= 10) {
-        array[i] = i + 1 + pando::getCurrentPlace().node.id;
-      } else {
-        array[i] = i - 9 + pando::getCurrentPlace().node.id;
-      }
-    }
-    mutex.unlock();
-    gb.done();
-  };
-  for (std::int16_t nodeId = 0; nodeId < dims.node.id; nodeId++) {
-    EXPECT_EQ(
-        pando::executeOn(pando::Place{pando::NodeIndex{nodeId}, pando::anyPod, pando::anyCore},
-                         func, gb, mutex, array),
-        pando::Status::Success);
-  }
+  galois::HostLocalStorage<std::uint64_t> hls{};  // NOTE(review): no initialize() here — confirm
+  auto tpl = galois::make_tpl(mutex, array);  // state handed to each doAll task
+  EXPECT_EQ(galois::doAll(
+                tpl, hls,
+                +[](decltype(tpl) tpl, pando::GlobalRef<std::uint64_t>) {
+                  auto [mutex, array] = tpl;
+                  mutex.lock();  // the array writes below happen under the lock
+                  for (int i = 0; i < 10; i++) {
+                    if ((i + 1 + pando::getCurrentPlace().node.id) <= 10) {
+                      array[i] = i + 1 + pando::getCurrentPlace().node.id;
+                    } else {  // past 10: store the same value minus 10
+                      array[i] = i - 9 + pando::getCurrentPlace().node.id;
+                    }
+                  }
+                  mutex.unlock();
+                }),
+            pando::Status::Success);
 
-  EXPECT_EQ(gb.wait(), pando::Status::Success);
-  for (int i = 0; i < 10; i++) {
-    std::cout << array[i] << " ";
-  }
-  std::cout << std::endl;
   int sum = 0;
   for (int i = 0; i < 10; i++) {
     sum += array[i];
   }
   EXPECT_EQ(sum, 55);
 
-  gb.deinitialize();
   array.deinitialize();
+  mutex.deinitialize();
 }
diff --git a/test/test_mirror_master_table.cpp b/test/test_mirror_master_table.cpp
new file mode 100644
index 00000000..0e0c2f12
--- /dev/null
+++ b/test/test_mirror_master_table.cpp
@@ -0,0 +1,117 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023. University of Texas at Austin. All rights reserved.
+
+#include <getopt.h>
+#include <pando-rt/export.h>
+
+#include <cstdint>
+#include <iostream>
+#include <numeric>
+
+#include <pando-lib-galois/containers/hashtable.hpp>
+#include <pando-lib-galois/graphs/dist_local_csr.hpp>
+#include <pando-lib-galois/graphs/edge_list_importer.hpp>
+#include <pando-lib-galois/graphs/wmd_graph.hpp>
+#include <pando-lib-galois/import/ingest_rmat_el.hpp>
+#include <pando-lib-galois/import/ingest_wmd_csv.hpp>
+#include <pando-lib-galois/import/wmd_graph_importer.hpp>
+#include <pando-rt/memory/memory_guard.hpp>
+#include <pando-rt/memory_resource.hpp>
+#include <pando-rt/pando-rt.hpp>
+
+void printUsageExit(char* argv0) {  // print the expected flags, then exit with EXIT_FAILURE
+  std::cerr << "Usage: " << argv0 << " -n numVertices -f filepath" << '\n' << std::flush;
+  std::exit(EXIT_FAILURE);
+}
+
+template <typename T>
+using GV = pando::GlobalPtr<pando::Vector<T>>;
+
+template <typename T>
+using V = pando::Vector<T>;
+
+template <typename T>
+using G = pando::GlobalPtr<T>;
+void runTest(const char* elFile, std::uint64_t numVertices);
+
+int pandoMain(int argc, char** argv) {
+  // Parses "-n <numVertices> -f <filepath>" and forwards both values to runTest.
+  char* filepath = nullptr;
+  std::uint64_t numVertices = 0;
+
+  optind = 0;  // reset getopt's scan position before parsing
+  for (int flag = getopt(argc, argv, "n:f:"); flag != -1; flag = getopt(argc, argv, "n:f:")) {
+    if (flag == 'n') {
+      numVertices = strtoull(optarg, nullptr, 10);
+    } else if (flag == 'f') {
+      filepath = optarg;
+    } else {
+      printUsageExit(argv[0]);  // unrecognized option
+    }
+  }
+
+  // Both options are required; a vertex count of zero (including an unparsable -n value)
+  // is rejected as well.
+  const bool missingCount = (numVertices == 0);
+  const bool missingPath = (filepath == nullptr);
+  if (missingCount || missingPath) {
+    printUsageExit(argv[0]);
+  }
+
+  runTest(filepath, numVertices);
+  return 0;
+}
+
+void runTest(const char* elFile, std::uint64_t numVertices) {
+  // Imports elFile as a MirrorDistLocalCSR on node 0 and prints its mirror/master relations.
+  using ET = galois::ELEdge;
+  using VT = galois::ELVertex;
+  using Graph = galois::MirrorDistLocalCSR<VT, ET>;
+  galois::HostLocalStorageHeap::HeapInit();
+  galois::PodLocalStorageHeap::HeapInit();
+  pando::Array<char> filename;
+  std::size_t length = strlen(elFile);
+  PANDO_CHECK(filename.initialize(length + 1));
+  for (std::size_t i = 0; i < length; i++) {
+    filename[i] = elFile[i];
+  }
+  filename[length] = '\0';  // Ensure the string is null-terminated
+
+  if (pando::getCurrentPlace().node.id == 0) {
+    Graph graph =
+        galois::initializeELDLCSR<Graph, galois::ELVertex, galois::ELEdge>(filename, numVertices);
+    // Print each (mirror, master) address pair from the mirror-to-master table.
+    auto mirror_master_array = graph.getLocalMirrorToRemoteMasterOrderedTable();
+    for (auto elem : mirror_master_array) {
+      std::cout << "SET, " << lift(elem, getMirror).address << ", " << lift(elem, getMaster).address
+                << std::endl;
+    }
+
+    // Classify every edge destination: TRUE when it is mirrored here, FALSE when mastered here.
+    for (typename Graph::VertexTopologyID vert : graph.vertices()) {
+      for (typename Graph::EdgeHandle eh : graph.edges(vert)) {
+        typename Graph::VertexTokenID dstTok = graph.getTokenID(graph.getEdgeDst(eh));
+
+        auto mirrorTopology = graph.getTopologyID(dstTok);
+        auto masterTopology = graph.getGlobalTopologyID(dstTok);
+        if (mirrorTopology != masterTopology) {
+          // The two lookups disagree, so the destination has a mirror here:
+          // print the mirror address next to its master's address.
+          std::cout << "TRUE, " << mirrorTopology.address << ", " << masterTopology.address
+                    << std::endl;
+        } else {
+          // The two lookups agree, so there is no mirror: either the master is local or the
+          // vertex is never an edge destination from here.
+          if (graph.isLocal(masterTopology)) {
+            // Only locally mastered destinations are reported; both addresses are equal.
+            std::cout << "FALSE, " << mirrorTopology.address << ", " << masterTopology.address
+                      << std::endl;
+          }
+        }
+      }
+    }
+    graph.deinitialize();
+  }
+  pando::waitAll();
+}
diff --git a/test/utility/test_gptr_monad.cpp b/test/utility/test_gptr_monad.cpp
index 1d23906c..d771cd4e 100644
--- a/test/utility/test_gptr_monad.cpp
+++ b/test/utility/test_gptr_monad.cpp
@@ -27,6 +27,14 @@ TEST(Fmap, GVectorInitialize) {
   pando::deallocateMemory(gvec, 1);
 }
 
+TEST(Fmap, VectorInitialize) {
+  constexpr std::uint64_t SIZE = 10;
+  pando::Vector<std::uint64_t> vec;
+  EXPECT_EQ(fmap(vec, initialize, SIZE), pando::Status::Success);  // status must not be ignored
+  EXPECT_EQ(vec.size(), SIZE);  // initialize(SIZE) is expected to leave SIZE elements
+  vec.deinitialize();
+}
+
 TEST(Fmap, GVectorPushBack) {
   constexpr std::uint64_t SIZE = 10;
   pando::GlobalPtr<pando::Vector<std::uint64_t>> gvec;
@@ -54,6 +62,25 @@ TEST(Fmap, GVectorPushBack) {
   pando::deallocateMemory(gvec, 1);
 }
 
+TEST(Fmap, VectorPushBack) {
+  constexpr std::uint64_t SIZE = 10;
+  pando::Vector<std::uint64_t> vec;
+  PANDO_CHECK(fmap(vec, initialize, 0));  // start empty so every element comes from pushBack
+
+  for (std::uint64_t value = 0; value != SIZE; ++value) {
+    PANDO_CHECK(fmap(vec, pushBack, value));
+  }
+  EXPECT_EQ(vec.size(), SIZE);
+
+  std::uint64_t visited = 0;  // doubles as the value expected at each position
+  for (std::uint64_t stored : vec) {
+    EXPECT_EQ(stored, visited);
+    ++visited;
+  }
+  vec.deinitialize();
+  EXPECT_EQ(SIZE, visited);
+}
+
 pando::Vector<pando::Vector<std::uint64_t>> generateFullyConnectedGraph(std::uint64_t SIZE) {
   pando::Vector<pando::Vector<std::uint64_t>> vec;
   EXPECT_EQ(vec.initialize(SIZE), pando::Status::Success);
@@ -113,6 +140,29 @@ TEST(FmapVoid, GDistArrayCSR) {
   liftVoid(*ggraph, deinitialize);
 }
 
+TEST(FmapVoid, DistArrayCSR) {
+  constexpr std::uint64_t SIZE = 10;
+  Graph graph{};
+  auto vvec = generateFullyConnectedGraph(SIZE);
+  PANDO_CHECK(fmap(graph, initialize, vvec));
+  PANDO_CHECK(deleteVectorVector(vvec));
+  // Write pass: vertex `vertex` stores its own index, and (vertex, edge) stores their product.
+  for (std::uint64_t vertex = 0; vertex != SIZE; ++vertex) {
+    fmapVoid(graph, setData, vertex, vertex);
+    for (std::uint64_t edge = 0; edge != SIZE; ++edge) {
+      fmapVoid(graph, setEdgeData, vertex, edge, vertex * edge);
+    }
+  }
+  // Read pass: every value written above must come back unchanged.
+  for (std::uint64_t vertex = 0; vertex != SIZE; ++vertex) {
+    EXPECT_EQ(fmap(graph, getData, vertex), vertex);
+    for (std::uint64_t edge = 0; edge != SIZE; ++edge) {
+      EXPECT_EQ(fmap(graph, getEdgeData, vertex, edge), vertex * edge);
+    }
+  }
+  liftVoid(graph, deinitialize);
+}
+
 TEST(Lift, GVectorSize) {
   constexpr std::uint64_t SIZE = 10;
   pando::GlobalPtr<pando::Vector<std::uint64_t>> gvec;
@@ -130,6 +180,14 @@ TEST(Lift, GVectorSize) {
   pando::deallocateMemory(gvec, 1);
 }
 
+TEST(Lift, VectorSize) {
+  constexpr std::uint64_t SIZE = 10;
+  pando::Vector<std::uint64_t> vec;
+  PANDO_CHECK(fmap(vec, initialize, SIZE));
+  EXPECT_EQ(lift(vec, size), SIZE);  // the call under test: size() reached through lift
+  vec.deinitialize();
+}
+
 TEST(LiftVoid, GVectorDeinitialize) {
   constexpr std::uint64_t SIZE = 10;
   pando::GlobalPtr<pando::Vector<std::uint64_t>> gvec;
@@ -145,6 +203,14 @@ TEST(LiftVoid, GVectorDeinitialize) {
   pando::deallocateMemory(gvec, 1);
 }
 
+TEST(LiftVoid, VectorDeinitialize) {
+  constexpr std::uint64_t SIZE = 10;
+  pando::Vector<std::uint64_t> vec;
+  PANDO_CHECK(fmap(vec, initialize, SIZE));
+  EXPECT_EQ(lift(vec, size), SIZE);
+  liftVoid(vec, deinitialize);  // the call under test; nothing is asserted afterwards
+}
+
 TEST(PANDO_EXPECT_RETURN, Success) {
   auto success = +[]() -> pando::Status {
     const std::int32_t value = 42;
diff --git a/test/utility/test_prefix_sum.cpp b/test/utility/test_prefix_sum.cpp
index b000aacb..f017b484 100644
--- a/test/utility/test_prefix_sum.cpp
+++ b/test/utility/test_prefix_sum.cpp
@@ -50,7 +50,7 @@ TEST(PrefixSum, Init) {
   galois::PrefixSum<SRC, DST, SRC_Val, DST_Val, transmute<uint64_t>, scan_op<SRC_Val, DST_Val>,
                     combiner<DST_Val>, galois::DistArray>
       prefixSum(arr, prefixArr);
-  EXPECT_EQ(prefixSum.initialize(), pando::Status::Success);
+  EXPECT_EQ(prefixSum.initialize(pando::getPlaceDims().node.id), pando::Status::Success);
   prefixSum.computePrefixSum(elts);
 
   uint64_t expected = 0;
@@ -76,7 +76,7 @@ TEST(PrefixSum, PerThread) {
   galois::PrefixSum<SRC, DST, SRC_Val, DST_Val, transmuteV, scan_opV, combiner<DST_Val>,
                     galois::DistArray>
       prefixSum(arr.m_data, prefixArr);
-  EXPECT_EQ(prefixSum.initialize(), pando::Status::Success);
+  EXPECT_EQ(prefixSum.initialize(pando::getPlaceDims().node.id), pando::Status::Success);
   prefixSum.computePrefixSum(prefixArr.size());
   EXPECT_EQ(prefixArr[prefixArr.size() - 1], arr.sizeAll());
 }
@@ -97,7 +97,7 @@ TEST(PrefixSum, Array) {
                                    scan_op<SRC_VAL, DST_VAL>, combiner<DST_VAL>, galois::Array>;
   PFXSUM pfxsum(arr, arr);
 
-  PANDO_CHECK(pfxsum.initialize());
+  PANDO_CHECK(pfxsum.initialize(pando::getPlaceDims().core.x * pando::getPlaceDims().core.y));
 
   pfxsum.computePrefixSum(size);