Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add maven support #25

Draft
wants to merge 7 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions flake.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion flake.nix
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@

in
{
devShells.x86_64-linux.default = pkgs.mkShell { packages = [ pythonEnv ]; };
devShells.x86_64-linux.default = pkgs.mkShell { packages = [
pythonEnv pkgs.maven pkgs.yarn pkgs.pnpm
]; };
};
}
65 changes: 65 additions & 0 deletions tool/extract_deps.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,3 +329,68 @@ def get_patches_info(yarn_lock_file):
logging.info("Number of patches: %d", len(patches_info))

return patches_info


def _maven_tag_text(tag, xml_fragment):
    """Return the stripped text of the first ``<tag>`` in *xml_fragment*, or None."""
    match = re.search(rf"<{tag}>(.*?)</{tag}>", xml_fragment, re.DOTALL)
    return match.group(1).strip() if match else None


def _resolve_maven_placeholders(value, properties):
    """Replace each ``${name}`` in *value* with its property; keep unknown ones as-is."""
    return re.sub(
        r"\$\{([^}]+)\}",
        lambda m: properties.get(m.group(1), m.group(0)),
        value,
    )


def extract_deps_from_maven(pom_xml_content):
    """
    Extract dependencies from a Maven pom.xml file.

    Every ``<dependency>`` element that declares a groupId, an artifactId and
    a version is reported as ``"groupId:artifactId@version"``. ``${...}``
    placeholders are resolved against the ``<properties>`` section and against
    ``project.groupId`` / ``project.artifactId`` / ``project.version``, which
    are taken from the ``<parent>`` element when one is present.
    Placeholders that cannot be resolved are left untouched.

    Dependencies without an explicit ``<version>`` (e.g. versions managed by a
    parent or BOM) are skipped, because no usable coordinate can be built for
    them from this file alone.

    Args:
        pom_xml_content (str): The content of the pom.xml file.

    Returns:
        dict: ``{"resolutions": [...], "patches": []}`` where ``resolutions``
        is the list of extracted dependency coordinates. On empty input or on
        an extraction error the list is empty.
    """
    if not pom_xml_content:
        return {"resolutions": [], "patches": []}

    try:
        # Commented-out XML is not part of the document; drop it so that
        # disabled dependencies are not reported.
        xml = re.sub(r"<!--.*?-->", "", pom_xml_content, flags=re.DOTALL)

        # Extract properties
        properties = {}
        for prop_content in re.findall(
            r"<properties>(.*?)</properties>", xml, re.DOTALL
        ):
            for name, value in re.findall(
                r"<([^>]+)>(.*?)</\1>", prop_content, re.DOTALL
            ):
                properties[name] = value.strip()

        # Extract the parent coordinates if a parent exists. The search is
        # confined to the <parent> element so a missing tag cannot make the
        # match run into the rest of the document.
        # NOTE(review): the parent's coordinates are used as project.*; a POM
        # that overrides them at project level is not handled here.
        parent_match = re.search(r"<parent>(.*?)</parent>", xml, re.DOTALL)
        if parent_match:
            parent_block = parent_match.group(1)
            for tag in ("version", "artifactId", "groupId"):
                text = _maven_tag_text(tag, parent_block)
                if text is not None:
                    properties[f"project.{tag}"] = text

        deps_list = []
        # Match one <dependency> element at a time: a single regex over the
        # whole file would pair a version-less dependency with the version of
        # the following one.
        for dep_block in re.findall(
            r"<dependency>(.*?)</dependency>", xml, re.DOTALL
        ):
            # Exclusions carry their own groupId/artifactId; ignore them.
            dep_block = re.sub(
                r"<exclusions>.*?</exclusions>", "", dep_block, flags=re.DOTALL
            )
            group_id = _maven_tag_text("groupId", dep_block)
            artifact_id = _maven_tag_text("artifactId", dep_block)
            version = _maven_tag_text("version", dep_block)

            if not group_id or not artifact_id:
                continue
            if not version:
                logging.warning(
                    "Skipping Maven dependency without explicit version: %s:%s",
                    group_id,
                    artifact_id,
                )
                continue

            # Resolve property placeholders
            group_id = _resolve_maven_placeholders(group_id, properties)
            artifact_id = _resolve_maven_placeholders(artifact_id, properties)
            version = _resolve_maven_placeholders(version, properties)
            deps_list.append(f"{group_id}:{artifact_id}@{version}")

        return {"resolutions": deps_list, "patches": []}

    except (IOError, ValueError, KeyError) as e:
        logging.error(
            "An error occurred while extracting dependencies from pom.xml file: %s",
            str(e),
        )
        return {"resolutions": [], "patches": []}
20 changes: 17 additions & 3 deletions tool/github_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,17 +81,29 @@ def process_package(
timeout=TIMEOUT,
)

elif pm == "maven":
# package is in the form of group_id:artifact_id@version -- we need all 3
name, version = package.split("@")
group_id, artifact_id = name.split(":")
result = subprocess.run(
["mvn", "help:evaluate", "-Dexpression=project.scm.url", f"-Dartifact={group_id}:{artifact_id}:{version}", "-q", "-DforceStdout"],
capture_output=True,
text=True,
check=True,
timeout=TIMEOUT,
)

else:
raise ValueError(f"Unsupported package manager: {pm}")

repo_info = result.stdout if result.stdout else result.stderr
# print(f"Repo info for {package}: {repo_info}")
c.execute(
"INSERT OR IGNORE INTO pkg_github_repo_output (package, github) VALUES (?,?)",
(package, repo_info),
)
conn.commit()


# TODO: this shouldn't just be yarn
except subprocess.TimeoutExpired:
logging.error(
"Yarn info command timed out after %s seconds for package %s",
Expand All @@ -100,10 +112,12 @@ def process_package(
)
repo_info = None

# TODO: this shouldn't just be yarn
except subprocess.CalledProcessError as e:
logging.error("Yarn info command failed for package %s: %s", package, e)
repo_info = None

# TODO: npm?
package = package.replace("@npm:", "@")

if (
Expand Down Expand Up @@ -136,7 +150,7 @@ def get_github_repo_url(folder, dep_list, pm):

print("Getting GitHub URLs of packages...")
total_packages_to_process = len(dep_list.get("resolutions", []))
# have not process patches

with tqdm(total=total_packages_to_process, desc="Getting GitHub URLs") as pbar:
for pkg_res in dep_list.get("resolutions"):
package = pkg_res
Expand Down
40 changes: 20 additions & 20 deletions tool/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,9 @@ def get_args():
parser.add_argument(
"-pm",
"--package-manager",
choices=["yarn-berry", "yarn-classic", "pnpm", "maven"],
required=True,
help="The package manager used in the project.",
choices=["yarn-classic", "yarn-berry", "pnpm"],
help="Specify the package manager used in the project",
)

arguments = parser.parse_args()
Expand Down Expand Up @@ -117,14 +117,12 @@ def get_lockfile(project_repo_name, release_version, package_manager):
package_manager (str): The package manager used in the project.

Returns:
str: The content of the lockfile.
str: The content of the lockfile or pom.xml.
str: The default branch of the project.
str: The name of the project repository.
"""

tool_config.setup_cache("demo")
# logging.info("Cache [demo_cache] setup complete")

logging.info("Getting lockfile for %s@%s", project_repo_name, release_version)
logging.info("Package manager: %s", package_manager)

Expand All @@ -134,24 +132,21 @@ def get_lockfile(project_repo_name, release_version, package_manager):
lockfile_name = "yarn.lock"
elif package_manager == "pnpm":
lockfile_name = "pnpm-lock.yaml"
elif package_manager == "maven":
lockfile_name = "pom.xml"
else:
logging.error("Invalid package manager: %s", package_manager)
raise ValueError("Invalid package manager.")

response = requests.get(
f"https://api.github.com/repos/{project_repo_name}/contents/{lockfile_name}?ref={release_version}",
headers=headers,
timeout=20,
)

file_url = f"https://raw.githubusercontent.com/{project_repo_name}/{release_version}/{lockfile_name}"
response = requests.get(file_url, headers=headers, timeout=20)

if response.status_code == 200:
data = response.json()
download_url = data.get("download_url")
yarn_lock_content = requests.get(download_url, timeout=60).text
print("Got the Yarn.lock file.")
file_content = response.text
print(f"Got the {lockfile_name} file.")
else:
logging.error("Failed to get yarn.lock.")
raise ValueError("Failed to get yarn.lock.")
logging.error(f"Failed to get {lockfile_name}.")
raise ValueError(f"Failed to get {lockfile_name}.")

repo_branch_api = f"https://api.github.com/repos/{project_repo_name}"
repo_branch_response = requests.get(repo_branch_api, headers=headers, timeout=20)
Expand All @@ -165,8 +160,7 @@ def get_lockfile(project_repo_name, release_version, package_manager):
else:
raise ValueError("Failed to get default branch")

return yarn_lock_content, default_branch, project_repo_name

return file_content, default_branch, project_repo_name

def get_deps(folder_path, project_repo_name, release_version, package_manager):
"""
Expand All @@ -190,6 +184,12 @@ def get_deps(folder_path, project_repo_name, release_version, package_manager):
deps_list_all = extract_deps.extract_deps_from_pnpm_mono(
folder_path, release_version, project_repo_name
)

elif package_manager == "maven":
pom_xml_content, _, _ = get_lockfile(
project_repo_name, release_version, package_manager
)
deps_list_all = extract_deps.extract_deps_from_maven(pom_xml_content)

# extract deps from lockfile
else:
Expand Down Expand Up @@ -251,7 +251,7 @@ def static_analysis_all(
)

static_results, errors = static_analysis.get_static_data(
folder_path, repo_url_info, check_match=check_match
folder_path, repo_url_info, package_manager, check_match=check_match
)
logging.info("Errors: %s", errors)

Expand Down
6 changes: 3 additions & 3 deletions tool/report_static.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,13 @@ def create_dataframe(data):

row = {
"package_name": package_name,
"deprecated_in_version": package_data["npm_package_info"].get(
"deprecated_in_version": package_data.get("package_info", {}).get(
"deprecated_in_version"
),
"provenance_in_version": package_data["npm_package_info"].get(
"provenance_in_version": package_data.get("package_info", {}).get(
"provenance_in_version"
),
"all_deprecated": package_data["npm_package_info"].get(
"all_deprecated": package_data.get("package_info", {}).get(
"all_deprecated", None
),
"github_url": github_exists_data.get(
Expand Down
Loading