Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/hot topic of the day #64

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
6 changes: 6 additions & 0 deletions analysis/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""
This package contains scripts to perform global analyses, i.e., analyses on the entire database, which should
typically be performed periodically through a cron job.
"""

__author__ = 'fccoelho'
32 changes: 32 additions & 0 deletions analysis/freqdist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/usr/bin/env python
"""
This script uses the Sphinx search index to obtain a list of words and their frequencies.
"""
__author__ = 'fccoelho'

import argparse
import os



def generate_freqdist(dic, conf, ind, freq):
    """
    Generate freqdist using Sphinx's ``indexer`` program.

    Runs ``indexer --buildstops <dic> <freq> --buildfreqs <ind> -c <conf>``.
    The arguments are handed to the program as a list, without going through
    a shell, so characters such as spaces, ``;`` or ``#`` in a file name are
    passed through literally instead of being interpreted as shell syntax.

    :param dic: name of the file in which to save the freqdist
    :param conf: path to sphinx's configuration file
    :param ind: name of the index to analyse
    :param freq: frequency cutoff. Freqdist will contain only the `freq` most frequent words.
    :return: None
    """
    # Local imports: only this function needs them.
    import subprocess
    import sys

    # str() mirrors what str.format() did to each value in the old command string.
    command = ["indexer", "--buildstops", str(dic), str(freq), "--buildfreqs", str(ind), "-c", str(conf)]
    try:
        subprocess.call(command)
    except OSError as error:
        # os.system() did not raise when the program could not be started;
        # keep that contract but report the problem instead of hiding it.
        sys.stderr.write("Could not run indexer: {}\n".format(error))


if __name__ == "__main__":
    # Command-line entry point: collect the options and hand them to generate_freqdist().
    cli = argparse.ArgumentParser(description="Creates a file with the most common words in the index.")

    # (flags, keyword settings) for every supported option, registered in order.
    option_specs = (
        (("-d", "--dict"), dict(type=str, default="dict.txt", help="Dictionary file to be generated.")),
        (("-c", "--conf"), dict(type=str, help="Path to the sphinx configuration file.")),
        (("-i", "--index"), dict(type=str, help="Index to process.")),
        (("-f", "--frequency"), dict(type=int, default=100000, help="Frequency cutoff to use.")),
    )
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)

    options = cli.parse_args()
    generate_freqdist(options.dict, options.conf, options.index, options.frequency)
86 changes: 86 additions & 0 deletions analysis/htod.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/usr/bin/env python
"""
This script calculates the "hot topic of the day" by looking at all articles published on a given day.
"""
__author__ = 'fccoelho'

import argparse
import datetime
from collections import Counter

import pymongo
from pypln.api import PyPLN, Document


# Current local date and time (naive), captured once when the module is loaded.
Today = datetime.datetime.now()


def get_htod(d):
    """
    Add up the frequency distributions of every article published on a day.

    Articles are obtained from ``fetch_articles``; those without a
    "pypln_url" key are skipped. For the others the freqdist returned by
    ``get_doc_freqdist`` is turned into a Counter and accumulated.

    :param d: day in 'YYYY-MM-DD' format
    :return: Counter object with the summed counts
    """
    combined = Counter()
    for entry in fetch_articles(d):
        if "pypln_url" not in entry:
            continue
        # dict() converts the freqdist's (key, count) pairs into a mapping Counter can consume.
        combined += Counter(dict(get_doc_freqdist(entry["pypln_url"])))
    return combined


def get_doc_freqdist(url):
    """
    Get Freqdist for a given pypln document given its URL.

    The document is loaded with the module-level ``PYPLNUSER`` and
    ``PYPLNPASSWORD`` credentials and its "freqdist" property is returned.
    If a ``RuntimeError`` is raised while doing so, the failure is logged
    and an empty list is returned so callers can carry on with other documents.

    :param url: URL of the Document
    :return: Freqdist (list of lists); ``[]`` when a RuntimeError occurred
    """
    # Local import: only this function needs it.
    import logging

    try:
        doc = Document.from_url(url, (PYPLNUSER, PYPLNPASSWORD))
        return doc.get_property("freqdist")
    except RuntimeError:
        # Keep the best-effort fallback, but leave a trace of what went wrong.
        logging.getLogger(__name__).warning("Could not get freqdist for %s", url, exc_info=True)
        return []


def fetch_articles(d=None):
    """
    Fetch Articles published on a single Day.

    The query selects documents of the module-level ``ARTICLES`` collection
    whose "published" value lies in ``[start of day, start of next day)``.

    :param d: Day in 'YYYY-MM-DD' format. When None, the calendar day of the
              module-level ``Today`` is used.
    :return: result of ``ARTICLES.find`` (iterable of article dictionaries),
             with the "published" and "pypln_url" fields requested
    """
    if d is None:
        # Today also carries the time of day; truncate it to midnight so the
        # window covers the whole calendar day instead of "now .. now + 24h".
        start = Today.replace(hour=0, minute=0, second=0, microsecond=0)
    else:
        year, month, day = (int(part) for part in d.split('-'))
        start = datetime.datetime(year, month, day)
    end = start + datetime.timedelta(days=1)
    query = {"published": {"$gte": start, "$lt": end}}
    # The field list is passed positionally because the keyword changed name
    # between pymongo releases ("fields" in 2.x, "projection" in 3.x and later).
    return ARTICLES.find(query, ["published", "pypln_url"])


if __name__ == "__main__":
    # Command-line entry point: connect to MongoDB and PyPLN, then compute the
    # hot topics for the requested date.
    #
    # conflict_handler="resolve" is required because "-h" is used for the
    # MongoDB host below; without it argparse raises an error about the
    # clash with its built-in "-h/--help" option. Help stays available as --help.
    parser = argparse.ArgumentParser(
        description=("Calculate 'Hot Topics of the Day'.\nA ranking of the most mentioned subjects"),
        conflict_handler="resolve")

    # Zero-padded default so it matches the format advertised in the help text.
    parser.add_argument("-d", "--date", type=str, default=Today.strftime("%Y-%m-%d"),
                        help="Date to Analyse in YEAR-MO-DD format")
    parser.add_argument("-h", '--host', type=str, help='MongoDB host to connect to')
    parser.add_argument("-p", '--port', type=int, default=27017, help='MongoDB port to connect to')
    # Spelled "--pyplnhost" so that it matches the args.pyplnhost lookup below.
    parser.add_argument("--pyplnhost", type=str, help="PyPLN host to use.")
    parser.add_argument("--pyplnuser", type=str, help="PyPLN user.")
    parser.add_argument("--pyplnpassword", type=str, help="PyPLN password")

    args = parser.parse_args()

    client = pymongo.MongoClient(args.host, args.port)
    MCDB = client.MCDB
    FEEDS = MCDB.feeds  # Feed collection
    ARTICLES = MCDB.articles  # Article Collection
    PYPLNUSER = args.pyplnuser
    PYPLNPASSWORD = args.pyplnpassword
    pypln = PyPLN(args.pyplnhost, (args.pyplnuser, args.pyplnpassword))

    # NOTE(review): the Counter returned here is discarded; confirm whether
    # the ranking is meant to be printed or stored.
    get_htod(args.date)
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
pandas
Flask-SQLAlchemy
Flask-WTF
beautifulsoup4
Expand All @@ -15,4 +16,4 @@ pysolr
mongo-connector
tweepy
pypln.api
geopy
geopy
51 changes: 51 additions & 0 deletions tests/test_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#-*- coding:utf-8 -*-
u"""
Created on 24/04/14
by fccoelho
license: GPL V3 or Later
"""

__docformat__ = 'restructuredtext en'

import unittest
import datetime
from collections import Counter

import pymongo

from analysis import htod


class TestFreqdist(unittest.TestCase):
    """
    Placeholder test case without test methods.

    Writing tests for this is hard because it takes a long time to calculate the freqdist file.
    """

class TestHtod(unittest.TestCase):
    """
    Tests for analysis.htod.

    setUp connects to a MongoDB server on localhost:27017 and installs the
    client, the articles collection and the PyPLN credentials as attributes
    of the htod module, which its functions read as globals.
    """

    def setUp(self):
        htod.client = pymongo.MongoClient('localhost', 27017)
        htod.ARTICLES = htod.client.MCDB.articles
        htod.PYPLNUSER = "mediacloud2"
        htod.PYPLNPASSWORD = "senha do mediacloud"

    def tearDown(self):
        # Release the connection opened in setUp so clients do not pile up across tests.
        htod.client.close()

    def test_fetch_today_articles(self):
        d = "2014-02-14"
        arts = list(htod.fetch_articles(d))
        self.assertGreater(len(arts), 0)
        for a in arts:
            # Plain integers: a leading zero (02) is a syntax error on Python 3.
            self.assertGreaterEqual(a["published"], datetime.datetime(2014, 2, 14))
            self.assertLess(a["published"], datetime.datetime(2014, 2, 15))

    def test_get_doc_freqdist(self):
        d = "2014-02-14"
        arts = [a for a in htod.fetch_articles(d) if "pypln_url" in a]
        # Fail with a clear assertion instead of an IndexError when nothing matches.
        self.assertGreater(len(arts), 0)
        fd = htod.get_doc_freqdist(arts[0]['pypln_url'])
        self.assertIsInstance(fd, list)

    def test_get_htod(self):
        d = "2014-02-14"
        count = htod.get_htod(d)
        self.assertIsInstance(count, Counter)
        self.assertGreater(len(count), 0)