Skip to content

Commit

Permalink
Merge branch 'release/0.01'
Browse files Browse the repository at this point in the history
  • Loading branch information
wschuell committed Jan 15, 2021
2 parents 2dbb3ab + 5c8a81a commit 80e5bff
Show file tree
Hide file tree
Showing 6 changed files with 431 additions and 96 deletions.
24 changes: 22 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,11 +1,31 @@
dist: focal
language: python
services:
- postgresql
env:
global:
- PGVER=12
- PGPORT=5432
addons:
postgresql: "12"
apt:
packages:
- postgresql-12
- postgresql-client-12
# services:
# - postgresql
# - docker
python:
- "3.7"
before_install:
- sudo apt-get update -qq
- sudo apt-get install -y libsqlite3-dev
- sqlite3 --version
- sudo apt-get update
- sudo cp /etc/postgresql/12/main/pg_hba.conf pg_hba.conf
# - sudo apt-get --yes remove postgresql\*
- sudo apt-get install -y postgresql-12 postgresql-client-12
- sudo sed -i 's/port = 5433/port = 5432/' /etc/postgresql/12/main/postgresql.conf
- sudo cp pg_hba.conf /etc/postgresql/12/main/pg_hba.conf
- sudo service postgresql restart 12
before_script:
- psql -c 'create database travis_ci_test_repo_tools;' -U postgres
install:
Expand Down
3 changes: 2 additions & 1 deletion repo_tools/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from . import repo_crawler
from . import repo_database
from . import repo_database
from . import misc
2 changes: 1 addition & 1 deletion repo_tools/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.0.1'
__version__ = '0.0.1' #
28 changes: 28 additions & 0 deletions repo_tools/misc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import os
import datetime

def get_packages_from_crates(conn,limit=None):
    '''
    From a connection to a crates.io database, output the list of packages as expected by RepoCrawler.add_packages()
    package id, package name, created_at (datetime.datetime),repo_url

    conn: DB-API connection; only conn.cursor() is used
    limit: optional maximum number of rows; None (default) means no LIMIT clause

    Returns the result of cursor.fetchall() on
    SELECT id,name,created_at,repository FROM crates

    Raises ValueError when limit is neither None nor an integer. Booleans are rejected as well,
    because bool is a subclass of int and would otherwise end up in the query as "LIMIT True".
    '''
    # Validate before touching the database, so that a bad argument does not open a cursor
    if limit is None:
        limit_str = ''
    elif isinstance(limit,bool) or not isinstance(limit,int):
        raise ValueError('limit should be an integer, given {}'.format(limit))
    else:
        # limit is a checked int at this point, formatting it into the query is safe
        limit_str = ' LIMIT {}'.format(limit)

    cursor = conn.cursor()
    try:
        cursor.execute('''
            SELECT id,name,created_at,repository FROM crates {}
            ;'''.format(limit_str))
        return cursor.fetchall()
    finally:
        # release the cursor even if the query fails
        cursor.close()
160 changes: 118 additions & 42 deletions repo_tools/repo_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@


from .repo_database import Database
from . import misc

logger = logging.getLogger(__name__)
ch = logging.StreamHandler()
Expand Down Expand Up @@ -61,7 +62,7 @@ def __init__(self,folder='.',ssh_mode=False,ssh_key=os.path.join(os.environ['HOM
db_folder = self.folder
self.set_db(db_folder=db_folder,**db_cfg)

def add_list(self,repo_list,source,source_urlroot=None,cloned=False):
def add_list(self,repo_list,source,source_urlroot=None,cloned=False): # DEPRECATED
'''
Behaving like an ordered set, if performance becomes an issue it could be useful to use OrderedSet implementation
Or simply code an option to disable checks
Expand All @@ -74,7 +75,7 @@ def add_list(self,repo_list,source,source_urlroot=None,cloned=False):
self.db.register_source(source=source,source_urlroot=source_urlroot)

for r in repo_list:
self.db.register_url(repo_url=r,source=source)
self.db.register_url(repo_url=r,source=source,clean_info=self.clean_url(r))
try:
r_f = self.repo_formatting(r,source_urlroot)
except RepoSyntaxError:
Expand All @@ -85,7 +86,7 @@ def add_list(self,repo_list,source,source_urlroot=None,cloned=False):
repo_id = self.db.get_repo_id(name=repo,owner=owner,source=source)
if cloned:
self.set_init_dl(repo=repo,owner=owner,source=source,repo_id=repo_id)
self.db.update_url(source=source,repo_url=r,repo_id=repo_id)
# self.db.update_url(source=source,repo_url=r,repo_id=repo_id) # calls deprecated function, need to adapt to register_urls

# if r_f not in self.repo_list:
# self.repo_list.append(r_f)
Expand All @@ -103,27 +104,61 @@ def set_db(self,db=None,**db_cfg):



def repo_formatting(self,repo,source_urlroot):
def repo_formatting(self,repo,source_urlroot,output_cleaned_url=False,raise_error=False):
'''
Formatting repositories so that they match the expected syntax 'user/project'
'''
r = copy.copy(repo)
if source_urlroot not in r:
raise RepoSyntaxError('Repo {} has not expected source {}.'.format(repo,source_urlroot))
for start_str in [
'{}/'.format(source_urlroot),
'https://{}/'.format(source_urlroot),
'http://{}/'.format(source_urlroot),
'https://www.{}/'.format(source_urlroot),
'http://www.{}/'.format(source_urlroot),
]:
if r.startswith(start_str):
r = '/'.join(r.split('/')[3:])
break

if source_urlroot in r:
raise RepoSyntaxError('Repo {} has not expected syntax for source {}.'.format(repo,source_urlroot))

r = r.replace('//','/')
if r.endswith('/'):
r = r[:-1]
if r.startswith('/'):
r = r[1:]
if r.endswith('.git'):
r = r[:-4]
if len(r.split('/')) != 2:
if (raise_error and len(r.split('/')) != 2):
raise RepoSyntaxError('Repo has not expected syntax "user/project" or prefixed with {}:{}. Please fix input or update the repo_formatting method.'.format(source_urlroot,repo))
r = '/'.join(r.split('/')[:2])
return r
if '' in r.split('/'):
raise ValueError('Critical syntax error for repository url: {}, parsed {}'.format(repo,r))
if output_cleaned_url:
return 'https://{}/{}'.format(source_urlroot,r)
else:
return r

def clean_url(self,url):
    '''
    Normalizes a repository url against the known sources.

    The (id,url_root) pairs of the sources table having a non-null url_root are read once
    and kept in self.url_roots; later calls reuse that list.
    Each url_root is tried in turn through self.repo_formatting(...,output_cleaned_url=True);
    the first one that does not raise RepoSyntaxError wins.

    Returns:
        None when url is None
        (clean_url,source_id) for the first matching url_root
        (None,None) when no url_root matches
    '''
    if url is None:
        return None

    try:
        known_roots = self.url_roots
    except AttributeError:
        # first call on this object: load and cache the url roots
        self.db.cursor.execute('SELECT id,url_root FROM sources WHERE url_root IS NOT NULL;')
        self.url_roots = list(self.db.cursor.fetchall())
        known_roots = self.url_roots

    for source_id,url_root in known_roots:
        try:
            cleaned = self.repo_formatting(repo=url,source_urlroot=url_root,output_cleaned_url=True)
        except RepoSyntaxError:
            # url does not belong to this source, try the next one
            continue
        else:
            return cleaned,source_id

    return None,None

def list_missing_repos(self):
'''
Expand Down Expand Up @@ -185,6 +220,45 @@ def add_all_from_folder(self,clean=True,rename=True):
self.add_list(repo_list=repos,source=source,cloned=True)
self.logger.info('Found {} repositories for source {}'.format(len(repos),source))


def fill_packages(self,package_list,source,force=False,clean_urls=True):
    '''
    adds repositories from a package repository database (eg crates)
    syntax of package list:
    package id (in source), package name, created_at (datetime.datetime),repo_url
    see .misc for wrappers

    package_list: iterated several times, so it has to be a re-iterable collection (not a generator)
    source: name of the source, passed to self.db.register_source
    force: when False, nothing is done if the packages table already holds a row for this source
    clean_urls: when True, urls are registered together with the output of self.clean_url
                (cleaned url, source id); when False only the raw urls are registered

    Packages whose repo_url is None are ignored for the url and repository steps.
    self.clean_url is called once per remaining url and its result is reused.
    '''
    if not force:
        if self.db.db_type == 'postgres':
            self.db.cursor.execute('SELECT * FROM packages WHERE source_id=(SELECT id FROM sources WHERE name=%s) LIMIT 1;',(source,))
        else:
            self.db.cursor.execute('SELECT * FROM packages WHERE source_id=(SELECT id FROM sources WHERE name=?) LIMIT 1;',(source,))
        if self.db.cursor.fetchone() is not None:
            # at least one package already registered for this source
            self.logger.info('Skipping packages from {}'.format(source))
            return

    self.logger.info('Filling packages from {}'.format(source))
    self.db.register_source(source)

    raw_urls = [p[3] for p in package_list if p[3] is not None]

    cleaned_infos = None
    if clean_urls:
        cleaned_infos = [self.clean_url(u) for u in raw_urls]
        self.db.register_urls(source=source,url_list=[(u,*info) for u,info in zip(raw_urls,cleaned_infos)])
    else:
        self.db.register_urls(source=source,url_list=raw_urls)

    self.logger.info('Filled URLs')

    if cleaned_infos is None:
        # clean_urls=False: cleaning is still needed to build the repository list,
        # and is done only after the raw urls have been registered
        cleaned_infos = [self.clean_url(u) for u in raw_urls]

    repo_info_list = []
    for cleaned,source_id in cleaned_infos:
        if cleaned is None:
            # url matches no known source
            continue
        parts = cleaned.split('/')
        # (source id, owner, name, cleaned url)
        repo_info_list.append((source_id,parts[-2],parts[-1],cleaned))

    self.db.register_repositories(repo_info_list=repo_info_list)
    self.logger.info('Filled repositories')

    self.db.register_packages(source=source,package_list=package_list)
    self.logger.info('Filled packages')



def build_url(self,name,owner,source_urlroot):
'''
building url, depending on mode (ssh or https)
Expand Down Expand Up @@ -308,41 +382,43 @@ def list_commits(self,name,source,owner,basic_info_only=False,repo_id=None,after
repo_id = self.db.get_repo_id(source=source,name=name,owner=owner)
# repo_obj.walk(repo.head.target, pygit2.GIT_SORT_TOPOLOGICAL | pygit2.GIT_SORT_REVERSE)
# for commit in repo_obj.walk(repo_obj.head.target, pygit2.GIT_SORT_TIME | pygit2.GIT_SORT_REVERSE):
for commit in repo_obj.walk(repo_obj.head.target, pygit2.GIT_SORT_TIME):
if after_time is not None and commit.commit_time<after_time:
break
if basic_info_only:
yield {
'author_email':commit.author.email,
'author_name':commit.author.name,
'time':commit.commit_time,
'time_offset':commit.commit_time_offset,
'sha':commit.hex,
'parents':[pid.hex for pid in commit.parent_ids],
'repo_id':repo_id,
}
else:
if commit.parents:
diff_obj = repo_obj.diff(commit.parents[0],commit)# Inverted order wrt the expected one, to have expected values for insertions and deletions
insertions = diff_obj.stats.insertions
deletions = diff_obj.stats.deletions

if not repo_obj.is_empty:
for commit in repo_obj.walk(repo_obj.head.target, pygit2.GIT_SORT_TIME):
if after_time is not None and commit.commit_time<after_time:
break
if basic_info_only:
yield {
'author_email':commit.author.email,
'author_name':commit.author.name,
'time':commit.commit_time,
'time_offset':commit.commit_time_offset,
'sha':commit.hex,
'parents':[pid.hex for pid in commit.parent_ids],
'repo_id':repo_id,
}
else:
diff_obj = commit.tree.diff_to_tree()
# re-inverting insertions and deletions, to get expected values
deletions = diff_obj.stats.insertions
insertions = diff_obj.stats.deletions
yield {
'author_email':commit.author.email,
'author_name':commit.author.name,
'time':commit.commit_time,
'time_offset':commit.commit_time_offset,
'sha':commit.hex,
'parents':[pid.hex for pid in commit.parent_ids],
'insertions':insertions,
'deletions':deletions,
'total':insertions+deletions,
'repo_id':repo_id,
}
if commit.parents:
diff_obj = repo_obj.diff(commit.parents[0],commit)# Inverted order wrt the expected one, to have expected values for insertions and deletions
insertions = diff_obj.stats.insertions
deletions = diff_obj.stats.deletions
else:
diff_obj = commit.tree.diff_to_tree()
# re-inverting insertions and deletions, to get expected values
deletions = diff_obj.stats.insertions
insertions = diff_obj.stats.deletions
yield {
'author_email':commit.author.email,
'author_name':commit.author.name,
'time':commit.commit_time,
'time_offset':commit.commit_time_offset,
'sha':commit.hex,
'parents':[pid.hex for pid in commit.parent_ids],
'insertions':insertions,
'deletions':deletions,
'total':insertions+deletions,
'repo_id':repo_id,
}


def fill_commit_info(self,force=False,all_commits=False):
Expand All @@ -357,9 +433,9 @@ def fill_commit_info(self,force=False,all_commits=False):
last_dl = self.db.cursor.fetchone()[0]

if all_commits:
option = 'basicinfo_dict'
option = 'basicinfo_dict_cloned'
else:
option = 'basicinfo_dict_time'
option = 'basicinfo_dict_time_cloned'

if force or (last_fu is None) or (last_dl is not None and last_fu<last_dl):

Expand Down
Loading

0 comments on commit 80e5bff

Please sign in to comment.