Skip to content

Commit

Permalink
Fix glob behaviour (#422)
Browse files Browse the repository at this point in the history
* remove _glob override

* tests: update glob tests

* remove _glob_find

* fix _find for .../A=1/B=2/ paths

* fix linting

* test: adjust returned dir

* require minimum fsspec>=2023.9.0
  • Loading branch information
ap-- authored Sep 16, 2023
1 parent 4b704b1 commit a1b8b52
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 154 deletions.
140 changes: 17 additions & 123 deletions adlfs/spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -590,84 +590,6 @@ async def _info(self, path, refresh=False, **kwargs):

raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), fullpath)

def glob(self, path, **kwargs):
    """
    Synchronous entry point for glob matching.

    Runs the ``_glob`` coroutine through ``sync`` on ``self.loop`` and
    returns whatever ``_glob`` returns.

    Parameters
    ----------
    path : str
        Glob pattern to match.
    **kwargs
        Forwarded to ``_glob`` (for example ``detail=True``).
    """
    # Forward the keyword arguments so that options such as ``detail``
    # actually reach ``_glob`` instead of being accepted and ignored.
    # NOTE(review): ``sync`` is presumably ``fsspec.asyn.sync``, which keeps
    # a ``timeout`` keyword for itself -- confirm against the file's imports.
    return sync(self.loop, self._glob, path, **kwargs)

async def _glob(self, path, **kwargs):
    """
    Find paths by glob-matching.

    The pattern is translated into a regular expression and applied to the
    entries returned by ``self._glob_find`` (directories included).
    ``"*"`` matches any run of characters other than ``/``, ``"**"`` matches
    any run of characters, ``"?"`` matches a single character and
    ``"[..]"`` is handed to the regular expression unchanged.

    Parameters
    ----------
    path : str
        Glob pattern, or a plain path. A plain path without a trailing
        ``/`` is only checked for existence; with a trailing ``/`` it is
        globbed as ``path + "/*"``.
    **kwargs
        ``detail`` (default False) selects the return shape; everything
        else is forwarded to ``self._glob_find``.

    Returns
    -------
    list of matching names (sorted), or, when ``detail`` is true, a dict
    mapping each matching name to its info entry.
    """
    import re

    had_trailing_slash = path.endswith("/")
    path = self._strip_protocol(path)
    want_detail = kwargs.pop("detail", False)

    # Offset of the first glob metacharacter; len(path) when there is none.
    magic_offsets = [path.find(symbol) for symbol in "*?["]
    first_magic = min(
        [offset for offset in magic_offsets if offset >= 0], default=len(path)
    )

    if not has_magic(path):
        if not had_trailing_slash:
            # Plain path: glob degenerates to an existence check.
            if await self._exists(path):
                return {path: await self._info(path)} if want_detail else [path]
            return {} if want_detail else []  # glob of non-existent returns empty
        # Search root is the path itself, captured before "/*" is appended.
        root, depth = path, 1
        path += "/*"
    elif "/" in path[:first_magic]:
        # Search from the deepest directory that precedes any metacharacter.
        cut = path[:first_magic].rindex("/")
        root = path[: cut + 1]
        depth = 20 if "**" in path else path[cut + 1 :].count("/") + 1
    else:
        root = ""
        depth = 20 if "**" in path else 1

    candidates = await self._glob_find(
        root, maxdepth=depth, withdirs=True, detail=True, **kwargs
    )

    # Escape regex-significant characters (and collapse "//") in the same
    # order as a chained series of ``str.replace`` calls would.
    escaped = path
    for old, new in (
        ("\\", r"\\"),
        (".", r"\."),
        ("+", r"\+"),
        ("//", "/"),
        ("(", r"\("),
        (")", r"\)"),
        ("|", r"\|"),
    ):
        escaped = escaped.replace(old, new)
    escaped = escaped.rstrip("/").replace("?", ".")

    regex_source = "^" + escaped + "$"
    # Park "**" behind a placeholder so the single-"*" rewrite leaves it alone.
    regex_source = re.sub("[*]{2}", "=PLACEHOLDER=", regex_source)
    regex_source = re.sub("[*]", "[^/]*", regex_source)
    matcher = re.compile(regex_source.replace("=PLACEHOLDER=", ".*"))

    matched = {}
    for candidate in sorted(candidates):
        if matcher.match(candidate.replace("//", "/").rstrip("/")):
            matched[candidate] = candidates[candidate]

    return matched if want_detail else list(matched)

async def _ls_containers(self, return_glob: bool = False):
if _ROOT_PATH not in self.dircache or return_glob:
# This is the case where only the containers are being returned
Expand Down Expand Up @@ -982,15 +904,23 @@ async def _find(self, path, withdirs=False, prefix="", with_parent=False, **kwar
infos = []

for info in infos:
name = info["name"]
parent_dir = self._parent(name).rstrip("/") + "/"
if parent_dir not in dir_set and parent_dir != full_path.strip("/"):
dir_set.add(parent_dir)
dirs[parent_dir] = {
"name": parent_dir,
"type": "directory",
"size": 0,
}
name = _name = info["name"]
while True:
parent_dir = self._parent(_name).rstrip("/") + "/"
if (
parent_dir not in dir_set
and parent_dir != full_path.strip("/") + "/"
):
dir_set.add(parent_dir)
dirs[parent_dir] = {
"name": parent_dir,
"type": "directory",
"size": 0,
}
_name = parent_dir.rstrip("/")
else:
break

if info["type"] == "directory":
dirs[name] = info
if info["type"] == "file":
Expand All @@ -1014,42 +944,6 @@ async def _find(self, path, withdirs=False, prefix="", with_parent=False, **kwar
return names
return {name: files[name] for name in names}

async def _glob_find(self, path, maxdepth=None, withdirs=False, **kwargs):
    """List all files below path in a recursive manner.

    Like the posix ``find`` command without conditions.

    Parameters
    ----------
    path : str
    maxdepth : int or None
        If not None, the maximum number of levels to descend
    withdirs : bool
        Whether to include directory paths in the output. This is True
        when used by glob, but users usually only want files.
    **kwargs
        ``detail`` (default False) selects the return shape; the remaining
        keyword arguments are passed to ``self._async_walk``.

    Returns
    -------
    Sorted list of names, or, when ``detail`` is true, a dict mapping each
    name (in sorted order) to its info entry.
    """
    # TODO: allow equivalent of -name parameter
    path = self._strip_protocol(path)
    out = {}
    detail = kwargs.pop("detail", False)
    # The walked directory is deliberately not bound to ``path``: the
    # requested path is still needed for the ``_isfile`` check below, and
    # rebinding it would make that check test the last directory visited.
    async for _, dirs, files in self._async_walk(
        path, maxdepth, detail=True, **kwargs
    ):
        if files == []:
            # presumably the walker yields lists instead of dicts for an
            # empty level -- TODO confirm; both are reset to empty dicts.
            files = {}
            dirs = {}
        if withdirs:
            files.update(dirs)
        out.update({info["name"]: info for info in files.values()})
    if await self._isfile(path) and path not in out:
        # walk works on directories, but find should also return [path]
        # when path happens to be a file
        out[path] = {}
    names = sorted(out)
    if not detail:
        return names
    return {name: out[name] for name in names}

def _walk(self, path, dirs, files):
for p, d, f in zip([path], [dirs], [files]):
yield p, d, f
Expand Down
53 changes: 24 additions & 29 deletions adlfs/tests/test_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,24 +547,16 @@ def test_glob(storage):

# just the directory name
assert fs.glob("data/root") == ["data/root"]
assert fs.glob("data/root/") == ["data/root/"]

# top-level contents of a directory
assert fs.glob("data/root/") == [
"data/root/a",
"data/root/a1",
"data/root/b",
"data/root/c",
"data/root/d",
"data/root/e+f",
"data/root/rfile.txt",
]
assert fs.glob("data/root/*") == [
"data/root/a",
"data/root/a1",
"data/root/b",
"data/root/c",
"data/root/d",
"data/root/e+f",
"data/root/a/",
"data/root/a1/",
"data/root/b/",
"data/root/c/",
"data/root/d/",
"data/root/e+f/",
"data/root/rfile.txt",
]

Expand Down Expand Up @@ -609,40 +601,41 @@ def test_glob(storage):
"data/root/e+f/file1.txt",
"data/root/e+f/file2.txt",
"data/root/rfile.txt",
"data/top_file.txt",
]

# all files
assert fs.glob("data/root/**") == [
"data/root/a",
"data/root/a/",
"data/root/a/file.txt",
"data/root/a1",
"data/root/a1/",
"data/root/a1/file1.txt",
"data/root/b",
"data/root/b/",
"data/root/b/file.txt",
"data/root/c",
"data/root/c/",
"data/root/c/file1.txt",
"data/root/c/file2.txt",
"data/root/d",
"data/root/d/",
"data/root/d/file_with_metadata.txt",
"data/root/e+f",
"data/root/e+f/",
"data/root/e+f/file1.txt",
"data/root/e+f/file2.txt",
"data/root/rfile.txt",
]
assert fs.glob("data/roo**") == [
"data/root",
"data/root/a",
"data/root/",
"data/root/a/",
"data/root/a/file.txt",
"data/root/a1",
"data/root/a1/",
"data/root/a1/file1.txt",
"data/root/b",
"data/root/b/",
"data/root/b/file.txt",
"data/root/c",
"data/root/c/",
"data/root/c/file1.txt",
"data/root/c/file2.txt",
"data/root/d",
"data/root/d/",
"data/root/d/file_with_metadata.txt",
"data/root/e+f",
"data/root/e+f/",
"data/root/e+f/file1.txt",
"data/root/e+f/file2.txt",
"data/root/rfile.txt",
Expand All @@ -666,6 +659,7 @@ def test_glob_full_uri(storage):
"data/root/e+f/file1.txt",
"data/root/e+f/file2.txt",
"data/root/rfile.txt",
"data/top_file.txt",
]

assert fs.glob("account.dfs.core.windows.net/data/**/*.txt") == [
Expand All @@ -678,6 +672,7 @@ def test_glob_full_uri(storage):
"data/root/e+f/file1.txt",
"data/root/e+f/file2.txt",
"data/root/rfile.txt",
"data/top_file.txt",
]


Expand Down Expand Up @@ -1196,7 +1191,7 @@ def test_dask_parquet(storage):
write_metadata_file=True,
)
assert fs.glob("test/test_group3.parquet/*") == [
"test/test_group3.parquet/A=1",
"test/test_group3.parquet/A=1/",
"test/test_group3.parquet/_common_metadata",
"test/test_group3.parquet/_metadata",
]
Expand Down
2 changes: 1 addition & 1 deletion requirements/earliest.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-r base.txt
fsspec==2021.10.1
fsspec==2023.9.0
azure-core==1.23.1
azure-datalake-store==0.0.46
azure-storage-blob==12.12.0
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
"azure-datalake-store>=0.0.46,<0.1",
"azure-identity",
"azure-storage-blob>=12.12.0",
"fsspec>=2021.10.1",
"fsspec>=2023.9.0",
"aiohttp>=3.7.0",
],
extras_require={
Expand Down

0 comments on commit a1b8b52

Please sign in to comment.