From 9dcee06b1cae24d6866a064750e93499ec1fa1ce Mon Sep 17 00:00:00 2001
From: Bowen Fu
Date: Tue, 22 Aug 2017 13:47:48 +0800
Subject: [PATCH 01/10] '2to3'

---
 quepy/cntagger.py    | 74 ++++++++++++++++++++++++++++++++++++++++++++
 quepy/jiebatagger.py | 75 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 149 insertions(+)
 create mode 100644 quepy/cntagger.py
 create mode 100644 quepy/jiebatagger.py

diff --git a/quepy/cntagger.py b/quepy/cntagger.py
new file mode 100644
index 0000000..0b42946
--- /dev/null
+++ b/quepy/cntagger.py
@@ -0,0 +1,74 @@
+# coding: utf-8
+
+# Copyright (c) 2012, Machinalis S.R.L.
+# This file is part of quepy and is distributed under the Modified BSD License.
+# You should have received a copy of license in the LICENSE file.
+#
+# Authors: Rafael Carrascosa
+#          Gonzalo Garcia Berrotaran
+
+import logging
+
+from quepy import settings
+from quepy.encodingpolicy import assert_valid_encoding
+
+logger = logging.getLogger("quepy.tagger")
+PENN_TAGSET = set(u"$ `` '' ( ) , -- . : CC CD DT EX FW IN JJ JJR JJS LS MD "
+                  "NN NNP NNPS NNS PDT POS PRP PRP$ RB RBR RBS RP SYM TO UH "
+                  "VB VBD VBG VBN VBP VBZ WDT WP WP$ WRB".split())
+
+
+class TaggingError(Exception):
+    """
+    Error parsing tagger's output.
+    """
+    pass
+
+
+class Word(object):
+    """
+    Representation of a tagged word.
+    Contains *token*, *lemma*, *pos tag* and optionally a *probability* of
+    that tag.
+    """
+    _encoding_attrs = u"token lemma pos".split()
+    _attrs = _encoding_attrs + [u"prob"]
+
+    def __init__(self, token, lemma=None, pos=None, prob=None):
+        self.pos = pos
+        self.prob = prob
+        self.lemma = lemma
+        self.token = token
+
+    def __setattr__(self, name, value):
+        if name in self._encoding_attrs and value is not None:
+            assert_valid_encoding(value)
+        object.__setattr__(self, name, value)
+
+    def __unicode__(self):
+        attrs = (getattr(self, name, u"-") for name in self._attrs)
+        return u"|".join(str(x) for x in attrs)
+
+    def __repr__(self):
+        return unicode(self)
+
+
+def get_cntagger():
+    """
+    Return a tagging function given some app settings.
+    `Settings` is the settings module of an app.
+    The returned value is a function that receives a unicode string
+    and returns a list of `Word` instances.
+    """
+    from quepy.jiebatagger import run_jiebatagger
+    tagger_function = lambda x: run_jiebatagger(x)
+
+    def wrapper(string):
+        assert_valid_encoding(string)
+        words = tagger_function(string)
+        for word in words:
+            if word.pos not in PENN_TAGSET:
+                logger.warning("Tagger emitted a non-penn "
+                               "POS tag {!r}".format(word.pos))
+        return words
+    return wrapper
diff --git a/quepy/jiebatagger.py b/quepy/jiebatagger.py
new file mode 100644
index 0000000..a9a238d
--- /dev/null
+++ b/quepy/jiebatagger.py
@@ -0,0 +1,75 @@
+# coding: utf-8
+
+# Copyright (c) 2012, Machinalis S.R.L.
+# This file is part of quepy and is distributed under the Modified BSD License.
+# You should have received a copy of license in the LICENSE file.
+#
+# Authors: Rafael Carrascosa
+#          Gonzalo Garcia Berrotaran
+
+"""
+Tagging using Jieba.
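+
+A minimal usage sketch (assuming jieba and NLTK's wordnet corpus are
+installed; the example sentence is illustrative)::
+
+    from quepy.jiebatagger import run_jiebatagger
+    words = run_jiebatagger(u"今天天气很好")
+    token_tag_pairs = [(w.token, w.pos) for w in words]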
+""" + +# Requiered data files are: +# - "averaged_perceptron_tagger" in Models +# - "wordnet" in Corpora + +import jieba +from quepy.tagger import Word +from quepy.encodingpolicy import assert_valid_encoding + +_penn_to_morphy_tag = {} + + +def penn_to_morphy_tag(tag): + assert_valid_encoding(tag) + + for penn, morphy in _penn_to_morphy_tag.iteritems(): + if tag.startswith(penn): + return morphy + return None + + +def run_jiebatagger(string): + """ + Runs jieba tagger on `string` and returns a list of + :class:`quepy.tagger.Word` objects. + """ + assert_valid_encoding(string) + global _penn_to_morphy_tag + + from nltk.corpus import wordnet + + if not _penn_to_morphy_tag: + _penn_to_morphy_tag = { + u'NN': wordnet.NOUN, + u'JJ': wordnet.ADJ, + u'VB': wordnet.VERB, + u'RB': wordnet.ADV, + } + + # Recommended tokenizer doesn't handle non-ascii characters very well + #tokens = jieba.word_tokenize(string) + token_tags = jieba.posseg.cut(string) + + words = [] + for token, pos in token_tags: + word = Word(token) + # Eliminates stuff like JJ|CC + # decode ascii because they are the penn-like POS tags (are ascii). + word.pos = pos.split("|")[0].decode("ascii") + + mtag = penn_to_morphy_tag(word.pos) + word.lemma = None + + words.append(word) + + return words From 68025540863a0d190978af9b0230b49aca036438 Mon Sep 17 00:00:00 2001 From: Bowen Fu Date: Tue, 22 Aug 2017 13:48:05 +0800 Subject: [PATCH 02/10] '2to3' --- docs/conf.py | 16 +- docs/conf.py.bak | 242 +++++++++++++++ examples/dbpedia/dbpedia/__init__.py | 16 +- examples/dbpedia/dbpedia/__init__.py.bak | 21 ++ examples/dbpedia/dbpedia/basic.py | 2 +- examples/dbpedia/dbpedia/basic.py.bak | 105 +++++++ examples/dbpedia/dbpedia/country.py | 2 +- examples/dbpedia/dbpedia/country.py.bak | 99 +++++++ examples/dbpedia/dbpedia/movies.py | 2 +- examples/dbpedia/dbpedia/movies.py.bak | 184 ++++++++++++ examples/dbpedia/dbpedia/music.py | 2 +- examples/dbpedia/dbpedia/music.py.bak | 97 ++++++ examples/dbpedia/dbpedia/people.py | 2 +- examples/dbpedia/dbpedia/people.py.bak | 66 +++++ examples/dbpedia/dbpedia/populated_place.py | 2 +- .../dbpedia/dbpedia/populated_place.py.bak | 60 ++++ examples/dbpedia/dbpedia/settings.py | 2 +- examples/dbpedia/dbpedia/settings.py.bak | 34 +++ examples/dbpedia/dbpedia/tvshows.py | 2 +- examples/dbpedia/dbpedia/tvshows.py.bak | 124 ++++++++ examples/dbpedia/dbpedia/writers.py | 2 +- examples/dbpedia/dbpedia/writers.py.bak | 69 +++++ examples/dbpedia/main.py | 38 +-- examples/dbpedia/main.py.bak | 212 +++++++++++++ examples/freebase/freebase/__init__.py | 14 +- examples/freebase/freebase/__init__.py.bak | 14 + examples/freebase/freebase/basic.py | 2 +- examples/freebase/freebase/basic.py.bak | 54 ++++ examples/freebase/freebase/country.py | 2 +- examples/freebase/freebase/country.py.bak | 94 ++++++ examples/freebase/freebase/movies.py | 2 +- examples/freebase/freebase/movies.py.bak | 184 ++++++++++++ examples/freebase/freebase/music.py | 2 +- examples/freebase/freebase/music.py.bak | 97 ++++++ examples/freebase/freebase/people.py | 2 +- examples/freebase/freebase/people.py.bak | 65 ++++ examples/freebase/freebase/tvshows.py | 2 +- examples/freebase/freebase/tvshows.py.bak | 112 +++++++ examples/freebase/freebase/writers.py | 2 +- examples/freebase/freebase/writers.py.bak | 69 +++++ examples/freebase/main.py | 14 +- examples/freebase/main.py.bak | 67 +++++ quepy/cntagger.py | 12 +- quepy/cntagger.py.bak | 74 +++++ quepy/dot_generation.py | 66 ++--- quepy/dot_generation.py.bak | 89 ++++++ quepy/dsl.py | 6 +- 
quepy/dsl.py.bak | 106 +++++++ quepy/encodingpolicy.py | 12 +- quepy/encodingpolicy.py.bak | 48 +++ quepy/expression.py | 2 +- quepy/expression.py.bak | 210 +++++++++++++ quepy/generation.py | 2 +- quepy/generation.py.bak | 38 +++ quepy/jiebatagger.py | 10 +- quepy/jiebatagger.py.bak | 68 +++++ quepy/mql_generation.py | 8 +- quepy/mql_generation.py.bak | 141 +++++++++ quepy/nltktagger.py | 10 +- quepy/nltktagger.py.bak | 81 +++++ quepy/quepyapp.py | 20 +- quepy/quepyapp.py.bak | 162 ++++++++++ quepy/settings.py | 2 +- quepy/settings.py.bak | 31 ++ quepy/sparql_generation.py | 30 +- quepy/sparql_generation.py.bak | 70 +++++ quepy/tagger.py | 12 +- quepy/tagger.py.bak | 74 +++++ tests/random_expression.py | 12 +- tests/random_expression.py.bak | 74 +++++ tests/test_dot_generation.py | 12 +- tests/test_dot_generation.py.bak | 87 ++++++ tests/test_dsl.py | 50 ++-- tests/test_dsl.py.bak | 97 ++++++ tests/test_expressions.py | 16 +- tests/test_expressions.py.bak | 279 ++++++++++++++++++ tests/test_mql_generation.py | 10 +- tests/test_mql_generation.py.bak | 68 +++++ tests/test_nltktagger.py | 2 +- tests/test_nltktagger.py.bak | 30 ++ tests/test_parsing.py | 30 +- tests/test_parsing.py.bak | 128 ++++++++ tests/test_quepyapp.py | 4 +- tests/test_quepyapp.py.bak | 48 +++ tests/test_sparql_generation.py | 12 +- tests/test_sparql_generation.py.bak | 102 +++++++ tests/test_tagger.py | 50 ++-- tests/test_tagger.py.bak | 67 +++++ tests/testapp/__init__.py | 2 +- tests/testapp/__init__.py.bak | 15 + 90 files changed, 4517 insertions(+), 261 deletions(-) create mode 100644 docs/conf.py.bak create mode 100644 examples/dbpedia/dbpedia/__init__.py.bak create mode 100644 examples/dbpedia/dbpedia/basic.py.bak create mode 100644 examples/dbpedia/dbpedia/country.py.bak create mode 100644 examples/dbpedia/dbpedia/movies.py.bak create mode 100644 examples/dbpedia/dbpedia/music.py.bak create mode 100644 examples/dbpedia/dbpedia/people.py.bak create mode 100644 examples/dbpedia/dbpedia/populated_place.py.bak create mode 100644 examples/dbpedia/dbpedia/settings.py.bak create mode 100644 examples/dbpedia/dbpedia/tvshows.py.bak create mode 100644 examples/dbpedia/dbpedia/writers.py.bak create mode 100644 examples/dbpedia/main.py.bak create mode 100644 examples/freebase/freebase/__init__.py.bak create mode 100644 examples/freebase/freebase/basic.py.bak create mode 100644 examples/freebase/freebase/country.py.bak create mode 100644 examples/freebase/freebase/movies.py.bak create mode 100644 examples/freebase/freebase/music.py.bak create mode 100644 examples/freebase/freebase/people.py.bak create mode 100644 examples/freebase/freebase/tvshows.py.bak create mode 100644 examples/freebase/freebase/writers.py.bak create mode 100644 examples/freebase/main.py.bak create mode 100644 quepy/cntagger.py.bak create mode 100644 quepy/dot_generation.py.bak create mode 100644 quepy/dsl.py.bak create mode 100644 quepy/encodingpolicy.py.bak create mode 100644 quepy/expression.py.bak create mode 100644 quepy/generation.py.bak create mode 100644 quepy/jiebatagger.py.bak create mode 100644 quepy/mql_generation.py.bak create mode 100644 quepy/nltktagger.py.bak create mode 100644 quepy/quepyapp.py.bak create mode 100644 quepy/settings.py.bak create mode 100644 quepy/sparql_generation.py.bak create mode 100644 quepy/tagger.py.bak create mode 100644 tests/random_expression.py.bak create mode 100644 tests/test_dot_generation.py.bak create mode 100644 tests/test_dsl.py.bak create mode 100644 tests/test_expressions.py.bak create mode 100644 
tests/test_mql_generation.py.bak create mode 100644 tests/test_nltktagger.py.bak create mode 100644 tests/test_parsing.py.bak create mode 100644 tests/test_quepyapp.py.bak create mode 100644 tests/test_sparql_generation.py.bak create mode 100644 tests/test_tagger.py.bak create mode 100644 tests/testapp/__init__.py.bak diff --git a/docs/conf.py b/docs/conf.py index 10cf615..434fd4c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -40,8 +40,8 @@ master_doc = 'index' # General information about the project. -project = u'Quepy' -copyright = u'2012, Machinalis' +project = 'Quepy' +copyright = '2012, Machinalis' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -183,8 +183,8 @@ # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ - ('index', 'Quepy.tex', u'Quepy Documentation', - u'Machinalis', 'manual'), + ('index', 'Quepy.tex', 'Quepy Documentation', + 'Machinalis', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of @@ -213,8 +213,8 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'quepy', u'Quepy Documentation', - [u'Machinalis'], 1) + ('index', 'quepy', 'Quepy Documentation', + ['Machinalis'], 1) ] # If true, show URL addresses after external links. @@ -227,8 +227,8 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'Quepy', u'Quepy Documentation', - u'Machinalis', 'Quepy', 'One line description of project.', + ('index', 'Quepy', 'Quepy Documentation', + 'Machinalis', 'Quepy', 'One line description of project.', 'Miscellaneous'), ] diff --git a/docs/conf.py.bak b/docs/conf.py.bak new file mode 100644 index 0000000..10cf615 --- /dev/null +++ b/docs/conf.py.bak @@ -0,0 +1,242 @@ +# -*- coding: utf-8 -*- +# +# Quepy documentation build configuration file, created by +# sphinx-quickstart on Mon Nov 5 14:12:47 2012. +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys, os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ----------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = ['sphinx.ext.autodoc'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. 
+project = u'Quepy' +copyright = u'2012, Machinalis' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '0.1' +# The full version, including alpha/beta/rc tags. +release = '0.1' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'default' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. 
+#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'Quepydoc' + + +# -- Options for LaTeX output -------------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). +latex_documents = [ + ('index', 'Quepy.tex', u'Quepy Documentation', + u'Machinalis', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output -------------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'quepy', u'Quepy Documentation', + [u'Machinalis'], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------------ + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ('index', 'Quepy', u'Quepy Documentation', + u'Machinalis', 'Quepy', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' diff --git a/examples/dbpedia/dbpedia/__init__.py b/examples/dbpedia/dbpedia/__init__.py index 47605e7..4276864 100644 --- a/examples/dbpedia/dbpedia/__init__.py +++ b/examples/dbpedia/dbpedia/__init__.py @@ -11,11 +11,11 @@ DBpedia quepy. 
""" -from basic import * -from music import * -from movies import * -from people import * -from country import * -from populated_place import * -from tvshows import * -from writers import * +from .basic import * +from .music import * +from .movies import * +from .people import * +from .country import * +from .populated_place import * +from .tvshows import * +from .writers import * diff --git a/examples/dbpedia/dbpedia/__init__.py.bak b/examples/dbpedia/dbpedia/__init__.py.bak new file mode 100644 index 0000000..47605e7 --- /dev/null +++ b/examples/dbpedia/dbpedia/__init__.py.bak @@ -0,0 +1,21 @@ +# coding: utf-8 + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. +# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +""" +DBpedia quepy. +""" + +from basic import * +from music import * +from movies import * +from people import * +from country import * +from populated_place import * +from tvshows import * +from writers import * diff --git a/examples/dbpedia/dbpedia/basic.py b/examples/dbpedia/dbpedia/basic.py index 16632f6..c972072 100644 --- a/examples/dbpedia/dbpedia/basic.py +++ b/examples/dbpedia/dbpedia/basic.py @@ -15,7 +15,7 @@ from quepy.parsing import Lemma, Pos, QuestionTemplate, Token, Particle, \ Lemmas from quepy.dsl import HasKeyword, IsRelatedTo, HasType -from dsl import DefinitionOf, LabelOf, IsPlace, \ +from .dsl import DefinitionOf, LabelOf, IsPlace, \ UTCof, LocationOf diff --git a/examples/dbpedia/dbpedia/basic.py.bak b/examples/dbpedia/dbpedia/basic.py.bak new file mode 100644 index 0000000..16632f6 --- /dev/null +++ b/examples/dbpedia/dbpedia/basic.py.bak @@ -0,0 +1,105 @@ +# coding: utf-8 + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. +# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +""" +Basic questions for DBpedia. +""" + +from refo import Group, Plus, Question +from quepy.parsing import Lemma, Pos, QuestionTemplate, Token, Particle, \ + Lemmas +from quepy.dsl import HasKeyword, IsRelatedTo, HasType +from dsl import DefinitionOf, LabelOf, IsPlace, \ + UTCof, LocationOf + + +# Openings +LISTOPEN = Lemma("list") | Lemma("name") + + +class Thing(Particle): + regex = Question(Pos("JJ")) + (Pos("NN") | Pos("NNP") | Pos("NNS")) |\ + Pos("VBN") + + def interpret(self, match): + return HasKeyword(match.words.tokens) + + +class WhatIs(QuestionTemplate): + """ + Regex for questions like "What is a blowtorch + Ex: "What is a car" + "What is Seinfield?" 
+ """ + + regex = Lemma("what") + Lemma("be") + Question(Pos("DT")) + \ + Thing() + Question(Pos(".")) + + def interpret(self, match): + label = DefinitionOf(match.thing) + + return label, "define" + + +class ListEntity(QuestionTemplate): + """ + Regex for questions like "List Microsoft software" + """ + + entity = Group(Pos("NNP"), "entity") + target = Group(Pos("NN") | Pos("NNS"), "target") + regex = LISTOPEN + entity + target + + def interpret(self, match): + entity = HasKeyword(match.entity.tokens) + target_type = HasKeyword(match.target.lemmas) + target = HasType(target_type) + IsRelatedTo(entity) + label = LabelOf(target) + + return label, "enum" + + +class WhatTimeIs(QuestionTemplate): + """ + Regex for questions about the time + Ex: "What time is it in Cordoba" + """ + + nouns = Plus(Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS")) + place = Group(nouns, "place") + openings = (Lemma("what") + + ((Token("is") + Token("the") + Question(Lemma("current")) + + Question(Lemma("local")) + Lemma("time")) | + (Lemma("time") + Token("is") + Token("it")))) | \ + Lemma("time") + regex = openings + Pos("IN") + place + Question(Pos(".")) + + def interpret(self, match): + place = HasKeyword(match.place.lemmas.title()) + IsPlace() + utc_offset = UTCof(place) + + return utc_offset, "time" + + +class WhereIsQuestion(QuestionTemplate): + """ + Ex: "where in the world is the Eiffel Tower" + """ + + thing = Group(Plus(Pos("IN") | Pos("NP") | Pos("NNP") | Pos("NNPS")), + "thing") + regex = Lemma("where") + Question(Lemmas("in the world")) + Lemma("be") + \ + Question(Pos("DT")) + thing + Question(Pos(".")) + + def interpret(self, match): + thing = HasKeyword(match.thing.tokens) + location = LocationOf(thing) + location_name = LabelOf(location) + + return location_name, "enum" diff --git a/examples/dbpedia/dbpedia/country.py b/examples/dbpedia/dbpedia/country.py index 509de27..c25397b 100644 --- a/examples/dbpedia/dbpedia/country.py +++ b/examples/dbpedia/dbpedia/country.py @@ -14,7 +14,7 @@ from refo import Plus, Question from quepy.dsl import HasKeyword from quepy.parsing import Lemma, Pos, QuestionTemplate, Token, Particle -from dsl import IsCountry, IncumbentOf, CapitalOf, \ +from .dsl import IsCountry, IncumbentOf, CapitalOf, \ LabelOf, LanguageOf, PopulationOf, PresidentOf diff --git a/examples/dbpedia/dbpedia/country.py.bak b/examples/dbpedia/dbpedia/country.py.bak new file mode 100644 index 0000000..509de27 --- /dev/null +++ b/examples/dbpedia/dbpedia/country.py.bak @@ -0,0 +1,99 @@ +# coding: utf-8 + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. +# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +""" +Country related regex +""" + +from refo import Plus, Question +from quepy.dsl import HasKeyword +from quepy.parsing import Lemma, Pos, QuestionTemplate, Token, Particle +from dsl import IsCountry, IncumbentOf, CapitalOf, \ + LabelOf, LanguageOf, PopulationOf, PresidentOf + + +class Country(Particle): + regex = Plus(Pos("DT") | Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS")) + + def interpret(self, match): + name = match.words.tokens.title() + return IsCountry() + HasKeyword(name) + + +class PresidentOfQuestion(QuestionTemplate): + """ + Regex for questions about the president of a country. + Ex: "Who is the president of Argentina?" 
+ """ + + regex = Pos("WP") + Token("is") + Question(Pos("DT")) + \ + Lemma("president") + Pos("IN") + Country() + Question(Pos(".")) + + def interpret(self, match): + president = PresidentOf(match.country) + incumbent = IncumbentOf(president) + label = LabelOf(incumbent) + + return label, "enum" + + +class CapitalOfQuestion(QuestionTemplate): + """ + Regex for questions about the capital of a country. + Ex: "What is the capital of Bolivia?" + """ + + opening = Lemma("what") + Token("is") + regex = opening + Pos("DT") + Lemma("capital") + Pos("IN") + \ + Question(Pos("DT")) + Country() + Question(Pos(".")) + + def interpret(self, match): + capital = CapitalOf(match.country) + label = LabelOf(capital) + return label, "enum" + + +# FIXME: the generated query needs FILTER isLiteral() to the head +# because dbpedia sometimes returns different things +class LanguageOfQuestion(QuestionTemplate): + """ + Regex for questions about the language spoken in a country. + Ex: "What is the language of Argentina?" + "what language is spoken in Argentina?" + """ + + openings = (Lemma("what") + Token("is") + Pos("DT") + + Question(Lemma("official")) + Lemma("language")) | \ + (Lemma("what") + Lemma("language") + Token("is") + + Lemma("speak")) + + regex = openings + Pos("IN") + Question(Pos("DT")) + Country() + \ + Question(Pos(".")) + + def interpret(self, match): + language = LanguageOf(match.country) + return language, "enum" + + +class PopulationOfQuestion(QuestionTemplate): + """ + Regex for questions about the population of a country. + Ex: "What is the population of China?" + "How many people live in China?" + """ + + openings = (Pos("WP") + Token("is") + Pos("DT") + + Lemma("population") + Pos("IN")) | \ + (Pos("WRB") + Lemma("many") + Lemma("people") + + Token("live") + Pos("IN")) + regex = openings + Question(Pos("DT")) + Country() + Question(Pos(".")) + + def interpret(self, match): + population = PopulationOf(match.country) + return population, "literal" diff --git a/examples/dbpedia/dbpedia/movies.py b/examples/dbpedia/dbpedia/movies.py index 9a31750..b0c1c27 100644 --- a/examples/dbpedia/dbpedia/movies.py +++ b/examples/dbpedia/dbpedia/movies.py @@ -14,7 +14,7 @@ from refo import Plus, Question from quepy.dsl import HasKeyword from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle -from dsl import IsMovie, NameOf, IsPerson, \ +from .dsl import IsMovie, NameOf, IsPerson, \ DirectedBy, LabelOf, DurationOf, HasActor, HasName, ReleaseDateOf, \ DirectorOf, StarsIn, DefinitionOf diff --git a/examples/dbpedia/dbpedia/movies.py.bak b/examples/dbpedia/dbpedia/movies.py.bak new file mode 100644 index 0000000..9a31750 --- /dev/null +++ b/examples/dbpedia/dbpedia/movies.py.bak @@ -0,0 +1,184 @@ +# coding: utf-8 + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. +# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +""" +Movie related regex. 
+""" + +from refo import Plus, Question +from quepy.dsl import HasKeyword +from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle +from dsl import IsMovie, NameOf, IsPerson, \ + DirectedBy, LabelOf, DurationOf, HasActor, HasName, ReleaseDateOf, \ + DirectorOf, StarsIn, DefinitionOf + +nouns = Plus(Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS")) + + +class Movie(Particle): + regex = Question(Pos("DT")) + nouns + + def interpret(self, match): + name = match.words.tokens + return IsMovie() + HasName(name) + + +class Actor(Particle): + regex = nouns + + def interpret(self, match): + name = match.words.tokens + return IsPerson() + HasKeyword(name) + + +class Director(Particle): + regex = nouns + + def interpret(self, match): + name = match.words.tokens + return IsPerson() + HasKeyword(name) + + +class ListMoviesQuestion(QuestionTemplate): + """ + Ex: "list movies" + """ + + regex = Lemma("list") + (Lemma("movie") | Lemma("film")) + + def interpret(self, match): + movie = IsMovie() + name = NameOf(movie) + return name, "enum" + + +class MoviesByDirectorQuestion(QuestionTemplate): + """ + Ex: "List movies directed by Quentin Tarantino. + "movies directed by Martin Scorsese" + "which movies did Mel Gibson directed" + """ + + regex = (Question(Lemma("list")) + (Lemma("movie") | Lemma("film")) + + Question(Lemma("direct")) + Lemma("by") + Director()) | \ + (Lemma("which") + (Lemma("movie") | Lemma("film")) + Lemma("do") + + Director() + Lemma("direct") + Question(Pos("."))) + + def interpret(self, match): + movie = IsMovie() + DirectedBy(match.director) + movie_name = LabelOf(movie) + + return movie_name, "enum" + + +class MovieDurationQuestion(QuestionTemplate): + """ + Ex: "How long is Pulp Fiction" + "What is the duration of The Thin Red Line?" + """ + + regex = ((Lemmas("how long be") + Movie()) | + (Lemmas("what be") + Pos("DT") + Lemma("duration") + + Pos("IN") + Movie())) + \ + Question(Pos(".")) + + def interpret(self, match): + duration = DurationOf(match.movie) + return duration, ("literal", "{} minutes long") + + +class ActedOnQuestion(QuestionTemplate): + """ + Ex: "List movies with Hugh Laurie" + "Movies with Matt LeBlanc" + "In what movies did Jennifer Aniston appear?" + "Which movies did Mel Gibson starred?" + "Movies starring Winona Ryder" + """ + + acted_on = (Lemma("appear") | Lemma("act") | Lemma("star")) + movie = (Lemma("movie") | Lemma("movies") | Lemma("film")) + regex = (Question(Lemma("list")) + movie + Lemma("with") + Actor()) | \ + (Question(Pos("IN")) + (Lemma("what") | Lemma("which")) + + movie + Lemma("do") + Actor() + acted_on + Question(Pos("."))) | \ + (Question(Pos("IN")) + Lemma("which") + movie + Lemma("do") + + Actor() + acted_on) | \ + (Question(Lemma("list")) + movie + Lemma("star") + Actor()) + + def interpret(self, match): + movie = IsMovie() + HasActor(match.actor) + movie_name = NameOf(movie) + return movie_name, "enum" + + +class MovieReleaseDateQuestion(QuestionTemplate): + """ + Ex: "When was The Red Thin Line released?" + "Release date of The Empire Strikes Back" + """ + + regex = ((Lemmas("when be") + Movie() + Lemma("release")) | + (Lemma("release") + Question(Lemma("date")) + + Pos("IN") + Movie())) + \ + Question(Pos(".")) + + def interpret(self, match): + release_date = ReleaseDateOf(match.movie) + return release_date, "literal" + + +class DirectorOfQuestion(QuestionTemplate): + """ + Ex: "Who is the director of Big Fish?" + "who directed Pocahontas?" 
+ """ + + regex = ((Lemmas("who be") + Pos("DT") + Lemma("director") + + Pos("IN") + Movie()) | + (Lemma("who") + Lemma("direct") + Movie())) + \ + Question(Pos(".")) + + def interpret(self, match): + director = IsPerson() + DirectorOf(match.movie) + director_name = NameOf(director) + return director_name, "literal" + + +class ActorsOfQuestion(QuestionTemplate): + """ + Ex: "who are the actors of Titanic?" + "who acted in Alien?" + "who starred in Depredator?" + "Actors of Fight Club" + """ + + regex = (Lemma("who") + Question(Lemma("be") + Pos("DT")) + + (Lemma("act") | Lemma("actor") | Lemma("star")) + + Pos("IN") + Movie() + Question(Pos("."))) | \ + ((Lemma("actors") | Lemma("actor")) + Pos("IN") + Movie()) + + def interpret(self, match): + actor = NameOf(IsPerson() + StarsIn(match.movie)) + return actor, "enum" + + +class PlotOfQuestion(QuestionTemplate): + """ + Ex: "what is Shame about?" + "plot of Titanic" + """ + + regex = ((Lemmas("what be") + Movie() + Lemma("about")) | \ + (Question(Lemmas("what be the")) + Lemma("plot") + + Pos("IN") + Movie())) + \ + Question(Pos(".")) + + def interpret(self, match): + definition = DefinitionOf(match.movie) + return definition, "define" diff --git a/examples/dbpedia/dbpedia/music.py b/examples/dbpedia/dbpedia/music.py index 006371b..4904e5f 100644 --- a/examples/dbpedia/dbpedia/music.py +++ b/examples/dbpedia/dbpedia/music.py @@ -14,7 +14,7 @@ from refo import Plus, Question from quepy.dsl import HasKeyword from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle -from dsl import IsBand, LabelOf, IsMemberOf, ActiveYears, MusicGenreOf, \ +from .dsl import IsBand, LabelOf, IsMemberOf, ActiveYears, MusicGenreOf, \ NameOf, IsAlbum, ProducedBy diff --git a/examples/dbpedia/dbpedia/music.py.bak b/examples/dbpedia/dbpedia/music.py.bak new file mode 100644 index 0000000..006371b --- /dev/null +++ b/examples/dbpedia/dbpedia/music.py.bak @@ -0,0 +1,97 @@ +# coding: utf-8 + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. +# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +""" +Music related regex +""" + +from refo import Plus, Question +from quepy.dsl import HasKeyword +from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle +from dsl import IsBand, LabelOf, IsMemberOf, ActiveYears, MusicGenreOf, \ + NameOf, IsAlbum, ProducedBy + + +class Band(Particle): + regex = Question(Pos("DT")) + Plus(Pos("NN") | Pos("NNP")) + + def interpret(self, match): + name = match.words.tokens.title() + return IsBand() + HasKeyword(name) + + +class BandMembersQuestion(QuestionTemplate): + """ + Regex for questions about band member. + Ex: "Radiohead members" + "What are the members of Metallica?" + """ + + regex1 = Band() + Lemma("member") + regex2 = Lemma("member") + Pos("IN") + Band() + regex3 = Pos("WP") + Lemma("be") + Pos("DT") + Lemma("member") + \ + Pos("IN") + Band() + + regex = (regex1 | regex2 | regex3) + Question(Pos(".")) + + def interpret(self, match): + member = IsMemberOf(match.band) + label = LabelOf(member) + return label, "enum" + + +class FoundationQuestion(QuestionTemplate): + """ + Regex for questions about the creation of a band. + Ex: "When was Pink Floyd founded?" + "When was Korn formed?" 
+ """ + + regex = Pos("WRB") + Lemma("be") + Band() + \ + (Lemma("form") | Lemma("found")) + Question(Pos(".")) + + def interpret(self, match): + active_years = ActiveYears(match.band) + return active_years, "literal" + + +class GenreQuestion(QuestionTemplate): + """ + Regex for questions about the genre of a band. + Ex: "What is the music genre of Gorillaz?" + "Music genre of Radiohead" + """ + + optional_opening = Question(Pos("WP") + Lemma("be") + Pos("DT")) + regex = optional_opening + Question(Lemma("music")) + Lemma("genre") + \ + Pos("IN") + Band() + Question(Pos(".")) + + def interpret(self, match): + genre = MusicGenreOf(match.band) + label = LabelOf(genre) + return label, "enum" + + +class AlbumsOfQuestion(QuestionTemplate): + """ + Ex: "List albums of Pink Floyd" + "What albums did Pearl Jam record?" + "Albums by Metallica" + """ + + regex = (Question(Lemma("list")) + (Lemma("album") | Lemma("albums")) + \ + Pos("IN") + Band()) | \ + (Lemmas("what album do") + Band() + + (Lemma("record") | Lemma("make")) + Question(Pos("."))) | \ + (Lemma("list") + Band() + Lemma("album")) + + def interpret(self, match): + album = IsAlbum() + ProducedBy(match.band) + name = NameOf(album) + return name, "enum" diff --git a/examples/dbpedia/dbpedia/people.py b/examples/dbpedia/dbpedia/people.py index a7f263e..06e6060 100644 --- a/examples/dbpedia/dbpedia/people.py +++ b/examples/dbpedia/dbpedia/people.py @@ -14,7 +14,7 @@ from refo import Plus, Question from quepy.dsl import HasKeyword from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle -from dsl import IsPerson, LabelOf, DefinitionOf, BirthDateOf, BirthPlaceOf +from .dsl import IsPerson, LabelOf, DefinitionOf, BirthDateOf, BirthPlaceOf class Person(Particle): diff --git a/examples/dbpedia/dbpedia/people.py.bak b/examples/dbpedia/dbpedia/people.py.bak new file mode 100644 index 0000000..a7f263e --- /dev/null +++ b/examples/dbpedia/dbpedia/people.py.bak @@ -0,0 +1,66 @@ +# coding: utf-8 + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. +# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +""" +People related regex +""" + +from refo import Plus, Question +from quepy.dsl import HasKeyword +from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle +from dsl import IsPerson, LabelOf, DefinitionOf, BirthDateOf, BirthPlaceOf + + +class Person(Particle): + regex = Plus(Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS")) + + def interpret(self, match): + name = match.words.tokens + return IsPerson() + HasKeyword(name) + + +class WhoIs(QuestionTemplate): + """ + Ex: "Who is Tom Cruise?" + """ + + regex = Lemma("who") + Lemma("be") + Person() + \ + Question(Pos(".")) + + def interpret(self, match): + definition = DefinitionOf(match.person) + return definition, "define" + + +class HowOldIsQuestion(QuestionTemplate): + """ + Ex: "How old is Bob Dylan". + """ + + regex = Pos("WRB") + Lemma("old") + Lemma("be") + Person() + \ + Question(Pos(".")) + + def interpret(self, match): + birth_date = BirthDateOf(match.person) + return birth_date, "age" + + +class WhereIsFromQuestion(QuestionTemplate): + """ + Ex: "Where is Bill Gates from?" 
+    """
+
+    regex = Lemmas("where be") + Person() + Lemma("from") + \
+        Question(Pos("."))
+
+    def interpret(self, match):
+        birth_place = BirthPlaceOf(match.person)
+        label = LabelOf(birth_place)
+
+        return label, "enum"
diff --git a/examples/dbpedia/dbpedia/populated_place.py b/examples/dbpedia/dbpedia/populated_place.py
index 3291c38..4307a1e 100644
--- a/examples/dbpedia/dbpedia/populated_place.py
+++ b/examples/dbpedia/dbpedia/populated_place.py
@@ -14,7 +14,7 @@
 from refo import Plus, Question
 from quepy.dsl import HasKeyword
 from quepy.parsing import Lemma, Pos, QuestionTemplate, Token, Particle
-from dsl import IsPopulatedPlace, IncumbentOf, CapitalOf, \
+from .dsl import IsPopulatedPlace, IncumbentOf, CapitalOf, \
     LabelOf, PopulationOf
 
 
diff --git a/examples/dbpedia/dbpedia/populated_place.py.bak b/examples/dbpedia/dbpedia/populated_place.py.bak
new file mode 100644
index 0000000..3291c38
--- /dev/null
+++ b/examples/dbpedia/dbpedia/populated_place.py.bak
@@ -0,0 +1,60 @@
+# coding: utf-8
+
+# Copyright (c) 2012, Machinalis S.R.L.
+# This file is part of quepy and is distributed under the Modified BSD License.
+# You should have received a copy of license in the LICENSE file.
+#
+# Authors: Rafael Carrascosa
+#          Gonzalo Garcia Berrotaran
+
+"""
+Populated place related regex
+"""
+
+from refo import Plus, Question
+from quepy.dsl import HasKeyword
+from quepy.parsing import Lemma, Pos, QuestionTemplate, Token, Particle
+from dsl import IsPopulatedPlace, IncumbentOf, CapitalOf, \
+    LabelOf, PopulationOf
+
+
+class PopulatedPlace(Particle):
+    regex = Plus(Pos("DT") | Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS"))
+
+    def interpret(self, match):
+        name = match.words.tokens.title()
+        return IsPopulatedPlace() + HasKeyword(name)
+
+
+class CapitalOfQuestion(QuestionTemplate):
+    """
+    Regex for questions about the capital of a country.
+    Ex: "What is the capital of Massachussets?"
+    """
+
+    opening = Lemma("what") + Token("is")
+    regex = opening + Pos("DT") + Lemma("capital") + Pos("IN") + \
+        Question(Pos("DT")) + PopulatedPlace() + Question(Pos("."))
+
+    def interpret(self, match):
+        capital = CapitalOf(match.populatedplace)
+        label = LabelOf(capital)
+        return label, "enum"
+
+
+class PopulationOfQuestion(QuestionTemplate):
+    """
+    Regex for questions about the population of a country.
+    Ex: "What is the population of Cordoba?"
+        "How many people live in Cordoba?"
+    """
+
+    openings = (Pos("WP") + Token("is") + Pos("DT") +
+                Lemma("population") + Pos("IN")) | \
+               (Pos("WRB") + Lemma("many") + Lemma("people") +
+                Token("live") + Pos("IN"))
+    regex = openings + Question(Pos("DT")) + PopulatedPlace() + Question(Pos("."))
+
+    def interpret(self, match):
+        population = PopulationOf(match.populatedplace)
+        return population, "literal"
diff --git a/examples/dbpedia/dbpedia/settings.py b/examples/dbpedia/dbpedia/settings.py
index a75779b..2073270 100644
--- a/examples/dbpedia/dbpedia/settings.py
+++ b/examples/dbpedia/dbpedia/settings.py
@@ -21,7 +21,7 @@
 DEFAULT_ENCODING = "utf-8"
 
 # Sparql config
-SPARQL_PREAMBLE = u"""
+SPARQL_PREAMBLE = """
 PREFIX owl: <http://www.w3.org/2002/07/owl#>
 PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
 PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
diff --git a/examples/dbpedia/dbpedia/settings.py.bak b/examples/dbpedia/dbpedia/settings.py.bak
new file mode 100644
index 0000000..a75779b
--- /dev/null
+++ b/examples/dbpedia/dbpedia/settings.py.bak
@@ -0,0 +1,34 @@
+# coding: utf-8
+
+# Copyright (c) 2012, Machinalis S.R.L.
+# This file is part of quepy and is distributed under the Modified BSD License.
+# You should have received a copy of license in the LICENSE file.
+#
+# Authors: Rafael Carrascosa
+#          Gonzalo Garcia Berrotaran
+
+"""
+Settings.
+"""
+
+# Generated query language
+LANGUAGE = "sparql"
+
+# NLTK config
+NLTK_DATA_PATH = []  # List of paths with NLTK data
+
+# Encoding config
+DEFAULT_ENCODING = "utf-8"
+
+# Sparql config
+SPARQL_PREAMBLE = u"""
+PREFIX owl: <http://www.w3.org/2002/07/owl#>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+PREFIX foaf: <http://xmlns.com/foaf/0.1/>
+PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
+PREFIX quepy: <http://www.machinalis.com/quepy#>
+PREFIX dbpedia: <http://dbpedia.org/resource/>
+PREFIX dbpprop: <http://dbpedia.org/property/>
+PREFIX dbpedia-owl: <http://dbpedia.org/ontology/>
+"""
diff --git a/examples/dbpedia/dbpedia/tvshows.py b/examples/dbpedia/dbpedia/tvshows.py
index 16c6144..0949d11 100644
--- a/examples/dbpedia/dbpedia/tvshows.py
+++ b/examples/dbpedia/dbpedia/tvshows.py
@@ -7,7 +7,7 @@
 from refo import Plus, Question
 from quepy.dsl import HasKeyword
 from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle
-from dsl import IsTvShow, ReleaseDateOf, IsPerson, StarsIn, LabelOf, \
+from .dsl import IsTvShow, ReleaseDateOf, IsPerson, StarsIn, LabelOf, \
     HasShowName, NumberOfEpisodesIn, HasActor, ShowNameOf, CreatorOf
 
 nouns = Plus(Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS"))
diff --git a/examples/dbpedia/dbpedia/tvshows.py.bak b/examples/dbpedia/dbpedia/tvshows.py.bak
new file mode 100644
index 0000000..16c6144
--- /dev/null
+++ b/examples/dbpedia/dbpedia/tvshows.py.bak
@@ -0,0 +1,124 @@
+# coding: utf-8
+
+"""
+Tv Shows related regex.
+"""
+
+from refo import Plus, Question
+from quepy.dsl import HasKeyword
+from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle
+from dsl import IsTvShow, ReleaseDateOf, IsPerson, StarsIn, LabelOf, \
+    HasShowName, NumberOfEpisodesIn, HasActor, ShowNameOf, CreatorOf
+
+nouns = Plus(Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS"))
+
+
+class TvShow(Particle):
+    regex = Plus(Question(Pos("DT")) + nouns)
+
+    def interpret(self, match):
+        name = match.words.tokens
+        return IsTvShow() + HasShowName(name)
+
+
+class Actor(Particle):
+    regex = nouns
+
+    def interpret(self, match):
+        name = match.words.tokens
+        return IsPerson() + HasKeyword(name)
+
+
+# FIXME: clash with movies release regex
+class ReleaseDateQuestion(QuestionTemplate):
+    """
+    Ex: when was Friends release?
+    """
+
+    regex = Lemmas("when be") + TvShow() + Lemma("release") + \
+        Question(Pos("."))
+
+    def interpret(self, match):
+        release_date = ReleaseDateOf(match.tvshow)
+        return release_date, "literal"
+
+
+class CastOfQuestion(QuestionTemplate):
+    """
+    Ex: "What is the cast of Friends?"
+        "Who works in Breaking Bad?"
+        "List actors of Seinfeld"
+    """
+
+    regex = (Question(Lemmas("what be") + Pos("DT")) +
+             Lemma("cast") + Pos("IN") + TvShow() + Question(Pos("."))) | \
+        (Lemmas("who works") + Pos("IN") + TvShow() +
+         Question(Pos("."))) | \
+        (Lemmas("list actor") + Pos("IN") + TvShow())
+
+    def interpret(self, match):
+        actor = IsPerson() + StarsIn(match.tvshow)
+        name = LabelOf(actor)
+        return name, "enum"
+
+
+class ListTvShows(QuestionTemplate):
+    """
+    Ex: "List TV shows"
+    """
+
+    regex = Lemmas("list tv show")
+
+    def interpret(self, match):
+        show = IsTvShow()
+        label = LabelOf(show)
+        return label, "enum"
+
+
+class EpisodeCountQuestion(QuestionTemplate):
+    """
+    Ex: "How many episodes does Seinfeld have?"
+ "Number of episodes of Seinfeld" + """ + + regex = ((Lemmas("how many episode do") + TvShow() + Lemma("have")) | + (Lemma("number") + Pos("IN") + Lemma("episode") + + Pos("IN") + TvShow())) + \ + Question(Pos(".")) + + def interpret(self, match): + number_of_episodes = NumberOfEpisodesIn(match.tvshow) + return number_of_episodes, "literal" + + +class ShowsWithQuestion(QuestionTemplate): + """ + Ex: "List shows with Hugh Laurie" + "In what shows does Jennifer Aniston appears?" + "Shows with Matt LeBlanc" + """ + + regex = (Lemmas("list show") + Pos("IN") + Actor()) | \ + (Pos("IN") + (Lemma("what") | Lemma("which")) + Lemmas("show do") + + Actor() + (Lemma("appear") | Lemma("work")) + + Question(Pos("."))) | \ + ((Lemma("show") | Lemma("shows")) + Pos("IN") + Actor()) + + def interpret(self, match): + show = IsTvShow() + HasActor(match.actor) + show_name = ShowNameOf(show) + return show_name, "enum" + + +class CreatorOfQuestion(QuestionTemplate): + """ + Ex: "Who is the creator of Breaking Bad?" + """ + + regex = Question(Lemmas("who be") + Pos("DT")) + \ + Lemma("creator") + Pos("IN") + TvShow() + Question(Pos(".")) + + def interpret(self, match): + creator = CreatorOf(match.tvshow) + label = LabelOf(creator) + return label, "enum" diff --git a/examples/dbpedia/dbpedia/writers.py b/examples/dbpedia/dbpedia/writers.py index 5affc14..49173cf 100644 --- a/examples/dbpedia/dbpedia/writers.py +++ b/examples/dbpedia/dbpedia/writers.py @@ -15,7 +15,7 @@ from refo import Plus, Question from quepy.dsl import HasKeyword from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle -from dsl import IsBook, HasAuthor, AuthorOf, IsPerson, NameOf +from .dsl import IsBook, HasAuthor, AuthorOf, IsPerson, NameOf nouns = Pos("DT") | Pos("IN") | Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS") diff --git a/examples/dbpedia/dbpedia/writers.py.bak b/examples/dbpedia/dbpedia/writers.py.bak new file mode 100644 index 0000000..5affc14 --- /dev/null +++ b/examples/dbpedia/dbpedia/writers.py.bak @@ -0,0 +1,69 @@ +# coding: utf-8 + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. +# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +""" +Writers related regex. +""" + + +from refo import Plus, Question +from quepy.dsl import HasKeyword +from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle +from dsl import IsBook, HasAuthor, AuthorOf, IsPerson, NameOf + + +nouns = Pos("DT") | Pos("IN") | Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS") + + +class Book(Particle): + regex = Plus(nouns) + + def interpret(self, match): + name = match.words.tokens + return IsBook() + HasKeyword(name) + + +class Author(Particle): + regex = Plus(nouns | Lemma(".")) + + def interpret(self, match): + name = match.words.tokens + return IsPerson() + HasKeyword(name) + + +class WhoWroteQuestion(QuestionTemplate): + """ + Ex: "who wrote The Little Prince?" + "who is the author of A Game Of Thrones?" + """ + + regex = ((Lemmas("who write") + Book()) | + (Question(Lemmas("who be") + Pos("DT")) + + Lemma("author") + Pos("IN") + Book())) + \ + Question(Pos(".")) + + def interpret(self, match): + author = NameOf(IsPerson() + AuthorOf(match.book)) + return author, "literal" + + +class BooksByAuthorQuestion(QuestionTemplate): + """ + Ex: "list books by George Orwell" + "which books did Suzanne Collins wrote?" 
+    """
+
+    regex = (Question(Lemma("list")) + Lemmas("book by") + Author()) | \
+        ((Lemma("which") | Lemma("what")) + Lemmas("book do") +
+         Author() + Lemma("write") + Question(Pos(".")))
+
+    def interpret(self, match):
+        book = IsBook() + HasAuthor(match.author)
+        book_name = NameOf(book)
+        return book_name, "enum"
diff --git a/examples/dbpedia/main.py b/examples/dbpedia/main.py
index 39eee69..86b2ccf 100644
--- a/examples/dbpedia/main.py
+++ b/examples/dbpedia/main.py
@@ -29,29 +29,29 @@
 def print_define(results, target, metadata=None):
     for result in results["results"]["bindings"]:
         if result[target]["xml:lang"] == "en":
-            print result[target]["value"]
-    print
+            print(result[target]["value"])
+    print()
 
 
 def print_enum(results, target, metadata=None):
     used_labels = []
 
     for result in results["results"]["bindings"]:
-        if result[target]["type"] == u"literal":
+        if result[target]["type"] == "literal":
             if result[target]["xml:lang"] == "en":
                 label = result[target]["value"]
                 if label not in used_labels:
                     used_labels.append(label)
-                    print label
+                    print(label)
 
 
 def print_literal(results, target, metadata=None):
     for result in results["results"]["bindings"]:
         literal = result[target]["value"]
         if metadata:
-            print metadata.format(literal)
+            print(metadata.format(literal))
         else:
-            print literal
+            print(literal)
 
 
 def print_time(results, target, metadata=None):
@@ -59,7 +59,7 @@ def print_time(results, target, metadata=None):
     gmt = datetime.datetime.fromtimestamp(gmt)
 
     for result in results["results"]["bindings"]:
-        offset = result[target]["value"].replace(u"−", u"-")
+        offset = result[target]["value"].replace("−", "-")
 
         if ("to" in offset) or ("and" in offset):
             if "to" in offset:
@@ -83,11 +83,11 @@ def print_time(results, target, metadata=None):
         location_string = random.choice(["where you are",
                                          "your location"])
 
-        print "Between %s %s %s, depending on %s" % \
+        print("Between %s %s %s, depending on %s" % \
             (from_time.strftime("%H:%M"),
              connector,
             to_time.strftime("%H:%M on %A"),
-             location_string)
+             location_string))
 
     else:
         offset = int(offset)
@@ -95,7 +95,7 @@ def print_time(results, target, metadata=None):
         delta = datetime.timedelta(hours=offset)
         the_time = gmt + delta
 
-        print the_time.strftime("%H:%M on %A")
+        print(the_time.strftime("%H:%M on %A"))
 
 
 def print_age(results, target, metadata=None):
@@ -110,7 +110,7 @@ def print_age(results, target, metadata=None):
     now = now.date()
 
     age = now - birth_date
-    print "{} years old".format(age.days / 365)
+    print("{} years old".format(age.days // 365))
 
 
 def wikipedia2dbpedia(wikipedia_url):
@@ -131,7 +131,7 @@ def wikipedia2dbpedia(wikipedia_url):
     results = sparql.query().convert()
 
     if not results["results"]["bindings"]:
-        print "Snorql URL not found"
+        print("Snorql URL not found")
         sys.exit(1)
     else:
         return results["results"]["bindings"][0]["url"]["value"]
@@ -163,7 +163,7 @@ def wikipedia2dbpedia(wikipedia_url):
     question = " ".join(sys.argv[1:])
 
     if question.count("wikipedia.org"):
-        print wikipedia2dbpedia(sys.argv[1])
+        print(wikipedia2dbpedia(sys.argv[1]))
         sys.exit(0)
     else:
         questions = [question]
@@ -179,8 +179,8 @@ def wikipedia2dbpedia(wikipedia_url):
     }
 
     for question in questions:
-        print question
-        print "-" * len(question)
+        print(question)
+        print("-" * len(question))
 
         target, query, metadata = dbpedia.get_query(question)
 
@@ -192,10 +192,10 @@
         metadata = None
 
         if query is None:
-            print "Query not generated :(\n"
+            print("Query not generated :(\n")
             continue
 
-        print query
+        print(query)
 
         if target.startswith("?"):
            target = 
target[1:] @@ -205,8 +205,8 @@ def wikipedia2dbpedia(wikipedia_url): results = sparql.query().convert() if not results["results"]["bindings"]: - print "No answer found :(" + print("No answer found :(") continue print_handlers[query_type](results, target, metadata) - print + print() diff --git a/examples/dbpedia/main.py.bak b/examples/dbpedia/main.py.bak new file mode 100644 index 0000000..39eee69 --- /dev/null +++ b/examples/dbpedia/main.py.bak @@ -0,0 +1,212 @@ +#!/usr/bin/env python +# coding: utf-8 + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. +# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +""" +Main script for DBpedia quepy. +""" + +import sys +import time +import random +import datetime + +import quepy +from SPARQLWrapper import SPARQLWrapper, JSON + +sparql = SPARQLWrapper("http://dbpedia.org/sparql") +dbpedia = quepy.install("dbpedia") + +# quepy.set_loglevel("DEBUG") + + +def print_define(results, target, metadata=None): + for result in results["results"]["bindings"]: + if result[target]["xml:lang"] == "en": + print result[target]["value"] + print + + +def print_enum(results, target, metadata=None): + used_labels = [] + + for result in results["results"]["bindings"]: + if result[target]["type"] == u"literal": + if result[target]["xml:lang"] == "en": + label = result[target]["value"] + if label not in used_labels: + used_labels.append(label) + print label + + +def print_literal(results, target, metadata=None): + for result in results["results"]["bindings"]: + literal = result[target]["value"] + if metadata: + print metadata.format(literal) + else: + print literal + + +def print_time(results, target, metadata=None): + gmt = time.mktime(time.gmtime()) + gmt = datetime.datetime.fromtimestamp(gmt) + + for result in results["results"]["bindings"]: + offset = result[target]["value"].replace(u"−", u"-") + + if ("to" in offset) or ("and" in offset): + if "to" in offset: + connector = "and" + from_offset, to_offset = offset.split("to") + else: + connector = "or" + from_offset, to_offset = offset.split("and") + + from_offset, to_offset = int(from_offset), int(to_offset) + + if from_offset > to_offset: + from_offset, to_offset = to_offset, from_offset + + from_delta = datetime.timedelta(hours=from_offset) + to_delta = datetime.timedelta(hours=to_offset) + + from_time = gmt + from_delta + to_time = gmt + to_delta + + location_string = random.choice(["where you are", + "your location"]) + + print "Between %s %s %s, depending on %s" % \ + (from_time.strftime("%H:%M"), + connector, + to_time.strftime("%H:%M on %A"), + location_string) + + else: + offset = int(offset) + + delta = datetime.timedelta(hours=offset) + the_time = gmt + delta + + print the_time.strftime("%H:%M on %A") + + +def print_age(results, target, metadata=None): + assert len(results["results"]["bindings"]) == 1 + + birth_date = results["results"]["bindings"][0][target]["value"] + year, month, days = birth_date.split("-") + + birth_date = datetime.date(int(year), int(month), int(days)) + + now = datetime.datetime.utcnow() + now = now.date() + + age = now - birth_date + print "{} years old".format(age.days / 365) + + +def wikipedia2dbpedia(wikipedia_url): + """ + Given a wikipedia URL returns the dbpedia resource + of that page. + """ + + query = """ + PREFIX foaf: + SELECT * WHERE { + ?url foaf:isPrimaryTopicOf <%s>. 
+ } + """ % wikipedia_url + + sparql.setQuery(query) + sparql.setReturnFormat(JSON) + results = sparql.query().convert() + + if not results["results"]["bindings"]: + print "Snorql URL not found" + sys.exit(1) + else: + return results["results"]["bindings"][0]["url"]["value"] + + +if __name__ == "__main__": + default_questions = [ + "What is a car?", + "Who is Tom Cruise?", + "Who is George Lucas?", + "Who is Mirtha Legrand?", + # "List Microsoft software", + "Name Fiat cars", + "time in argentina", + "what time is it in Chile?", + "List movies directed by Martin Scorsese", + "How long is Pulp Fiction", + "which movies did Mel Gibson starred?", + "When was Gladiator released?", + "who directed Pocahontas?", + "actors of Fight Club", + ] + + if "-d" in sys.argv: + quepy.set_loglevel("DEBUG") + sys.argv.remove("-d") + + if len(sys.argv) > 1: + question = " ".join(sys.argv[1:]) + + if question.count("wikipedia.org"): + print wikipedia2dbpedia(sys.argv[1]) + sys.exit(0) + else: + questions = [question] + else: + questions = default_questions + + print_handlers = { + "define": print_define, + "enum": print_enum, + "time": print_time, + "literal": print_literal, + "age": print_age, + } + + for question in questions: + print question + print "-" * len(question) + + target, query, metadata = dbpedia.get_query(question) + + if isinstance(metadata, tuple): + query_type = metadata[0] + metadata = metadata[1] + else: + query_type = metadata + metadata = None + + if query is None: + print "Query not generated :(\n" + continue + + print query + + if target.startswith("?"): + target = target[1:] + if query: + sparql.setQuery(query) + sparql.setReturnFormat(JSON) + results = sparql.query().convert() + + if not results["results"]["bindings"]: + print "No answer found :(" + continue + + print_handlers[query_type](results, target, metadata) + print diff --git a/examples/freebase/freebase/__init__.py b/examples/freebase/freebase/__init__.py index d3777a6..abebe7f 100644 --- a/examples/freebase/freebase/__init__.py +++ b/examples/freebase/freebase/__init__.py @@ -5,10 +5,10 @@ Init for freebase quepy. """ -from basic import * -from music import * -from people import * -from movies import * -from country import * -from tvshows import * -from writers import * \ No newline at end of file +from .basic import * +from .music import * +from .people import * +from .movies import * +from .country import * +from .tvshows import * +from .writers import * \ No newline at end of file diff --git a/examples/freebase/freebase/__init__.py.bak b/examples/freebase/freebase/__init__.py.bak new file mode 100644 index 0000000..d3777a6 --- /dev/null +++ b/examples/freebase/freebase/__init__.py.bak @@ -0,0 +1,14 @@ +#!/usr/bin/env python +# coding: utf-8 + +""" +Init for freebase quepy. 
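+
+A hedged usage sketch (mirrors the dbpedia example's main.py; the call
+shape is illustrative only):
+
+    import quepy
+    freebase = quepy.install("freebase")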
+""" + +from basic import * +from music import * +from people import * +from movies import * +from country import * +from tvshows import * +from writers import * \ No newline at end of file diff --git a/examples/freebase/freebase/basic.py b/examples/freebase/freebase/basic.py index 83b81b5..a79a4fe 100644 --- a/examples/freebase/freebase/basic.py +++ b/examples/freebase/freebase/basic.py @@ -12,7 +12,7 @@ """ from refo import Question, Plus -from dsl import DefinitionOf, NameOf, LocationOf +from .dsl import DefinitionOf, NameOf, LocationOf from quepy.dsl import HasKeyword from quepy.parsing import QuestionTemplate, Particle, Lemma, Pos, Lemmas diff --git a/examples/freebase/freebase/basic.py.bak b/examples/freebase/freebase/basic.py.bak new file mode 100644 index 0000000..83b81b5 --- /dev/null +++ b/examples/freebase/freebase/basic.py.bak @@ -0,0 +1,54 @@ +# coding: utf-8 + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. +# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +""" +Basic questions for Freebase. +""" + +from refo import Question, Plus +from dsl import DefinitionOf, NameOf, LocationOf +from quepy.dsl import HasKeyword +from quepy.parsing import QuestionTemplate, Particle, Lemma, Pos, Lemmas + + +class Thing(Particle): + regex = Plus(Question(Pos("JJ")) + (Pos("NN") | Pos("NNP") | Pos("NNS")) | + Pos("VBN")) + + def interpret(self, match): + return HasKeyword(match.words.tokens) + + +class WhatIs(QuestionTemplate): + """ + Regex for questions like "What is a blowtorch + Ex: "What is a car" + "What is Seinfield?" + """ + + regex = Lemma("what") + Lemma("be") + Question(Pos("DT")) + \ + Thing() + Question(Pos(".")) + + def interpret(self, match): + label = DefinitionOf(match.thing) + return label + + +class WhereIsQuestion(QuestionTemplate): + """ + Ex: "where in the world is the Eiffel Tower" + """ + + regex = Lemma("where") + Question(Lemmas("in the world")) + Lemma("be") + \ + Question(Pos("DT")) + Thing() + Question(Pos(".")) + + def interpret(self, match): + location = LocationOf(match.thing) + location_name = NameOf(location) + return location_name diff --git a/examples/freebase/freebase/country.py b/examples/freebase/freebase/country.py index dcdea15..886e4e0 100644 --- a/examples/freebase/freebase/country.py +++ b/examples/freebase/freebase/country.py @@ -11,7 +11,7 @@ Coutry related regex """ -from dsl import * +from .dsl import * from refo import Plus, Question from quepy.dsl import HasKeyword from quepy.parsing import Lemma, Pos, QuestionTemplate, Token, Particle diff --git a/examples/freebase/freebase/country.py.bak b/examples/freebase/freebase/country.py.bak new file mode 100644 index 0000000..dcdea15 --- /dev/null +++ b/examples/freebase/freebase/country.py.bak @@ -0,0 +1,94 @@ +# coding: utf-8 + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. 
+# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +""" +Coutry related regex +""" + +from dsl import * +from refo import Plus, Question +from quepy.dsl import HasKeyword +from quepy.parsing import Lemma, Pos, QuestionTemplate, Token, Particle + + +class Country(Particle): + regex = Plus(Pos("DT") | Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS")) + + def interpret(self, match): + name = match.words.tokens.title() + return IsCountry() + HasKeyword(name) + + +class PresidentOfQuestion(QuestionTemplate): + """ + Ex: "list presidents of Argentina?" + """ + + regex = Question(Lemma("list")) + Lemma("president") + Pos("IN") + \ + Country() + Question(Pos(".")) + + def interpret(self, match): + president = IsPresident() + PresidentOf(match.country) + name = NameOf(OfficeHolderOf(president)) + return name + + +class CapitalOfQuestion(QuestionTemplate): + """ + Regex for questions about the capital of a country. + Ex: "What is the capital of Bolivia?" + """ + + opening = Lemma("what") + Token("is") + regex = opening + Pos("DT") + Lemma("capital") + Pos("IN") + \ + Question(Pos("DT")) + Country() + Question(Pos(".")) + + def interpret(self, match): + capital = CapitalOf(match.country) + label = NameOf(capital) + return label + + +class LanguageOfQuestion(QuestionTemplate): + """ + Regex for questions about the language spoken in a country. + Ex: "What is the language of Argentina?" + "what language is spoken in Argentina?" + """ + + openings = (Lemma("what") + Token("is") + Pos("DT") + + Question(Lemma("official")) + Lemma("language")) | \ + (Lemma("what") + Lemma("language") + Token("is") + + Lemma("speak")) + + regex = openings + Pos("IN") + Question(Pos("DT")) + Country() + \ + Question(Pos(".")) + + def interpret(self, match): + language = LanguageOf(match.country) + name = NameOf(language) + return name + + +class PopulationOfQuestion(QuestionTemplate): + """ + Regex for questions about the population of a country. + Ex: "What is the population of China?" + "How many people live in China?" + """ + + openings = (Pos("WP") + Token("is") + Pos("DT") + + Lemma("population") + Pos("IN")) | \ + (Pos("WRB") + Lemma("many") + Lemma("people") + + Token("live") + Pos("IN")) + regex = openings + Question(Pos("DT")) + Country() + Question(Pos(".")) + + def interpret(self, match): + population = NumberOf(PopulationOf(match.country)) + return population diff --git a/examples/freebase/freebase/movies.py b/examples/freebase/freebase/movies.py index 38837ef..4a5a3fc 100644 --- a/examples/freebase/freebase/movies.py +++ b/examples/freebase/freebase/movies.py @@ -14,7 +14,7 @@ from refo import Plus, Question from quepy.dsl import HasKeyword from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle -from dsl import * +from .dsl import * nouns = Plus(Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS")) diff --git a/examples/freebase/freebase/movies.py.bak b/examples/freebase/freebase/movies.py.bak new file mode 100644 index 0000000..38837ef --- /dev/null +++ b/examples/freebase/freebase/movies.py.bak @@ -0,0 +1,184 @@ +# coding: utf-8 + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. +# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +""" +Movie related regex. 
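+
+A hedged sketch of how one question flows through the templates below
+(names come from this module; the flow description is illustrative only):
+
+    # "who directed Pocahontas?" matches DirectorOfQuestion, whose
+    # interpret() builds NameOf(IsPerson() + DirectorOf(movie)).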
+""" + +from refo import Plus, Question +from quepy.dsl import HasKeyword +from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle +from dsl import * + +nouns = Plus(Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS")) + + +class Movie(Particle): + regex = Question(Pos("DT")) + nouns + + def interpret(self, match): + name = match.words.tokens + return IsMovie() + HasName(name) + + +class Actor(Particle): + regex = nouns + + def interpret(self, match): + name = match.words.tokens + return IsPerson() + IsActor() + HasKeyword(name) + + +class Director(Particle): + regex = nouns + + def interpret(self, match): + name = match.words.tokens + return IsPerson() + IsDirector() + HasKeyword(name) + + +class ListMoviesQuestion(QuestionTemplate): + """ + Ex: "list movies" + """ + + regex = Lemma("list") + (Lemma("movie") | Lemma("film")) + + def interpret(self, match): + movie = IsMovie() + name = NameOf(movie) + return name + + +class MoviesByDirectorQuestion(QuestionTemplate): + """ + Ex: "List movies directed by Quentin Tarantino. + "movies directed by Martin Scorsese" + "which movies did Mel Gibson directed" + """ + + regex = (Question(Lemma("list")) + (Lemma("movie") | Lemma("film")) + + Question(Lemma("direct")) + Lemma("by") + Director()) | \ + (Lemma("which") + (Lemma("movie") | Lemma("film")) + Lemma("do") + + Director() + Lemma("direct") + Question(Pos("."))) + + def interpret(self, match): + movie = IsMovie() + DirectedBy(match.director) + movie_name = NameOf(movie) + return movie_name + + +class MovieDurationQuestion(QuestionTemplate): + """ + Ex: "How long is Pulp Fiction" + "What is the duration of The Thin Red Line?" + """ + + regex = ((Lemmas("how long be") + Movie()) | + (Lemmas("what be") + Pos("DT") + Lemma("duration") + + Pos("IN") + Movie())) + \ + Question(Pos(".")) + + def interpret(self, match): + duration = DurationOf(RuntimeOf(match.movie)) + return duration + + +class ActedOnQuestion(QuestionTemplate): + """ + Ex: "List movies with Hugh Laurie" + "Movies with Matt LeBlanc" + "In what movies did Jennifer Aniston appear?" + "Which movies did Mel Gibson starred?" + "Movies starring Winona Ryder" + """ + + acted_on = (Lemma("appear") | Lemma("act") | Lemma("star")) + movie = (Lemma("movie") | Lemma("movies") | Lemma("film")) + regex = (Question(Lemma("list")) + movie + Lemma("with") + Actor()) | \ + (Question(Pos("IN")) + (Lemma("what") | Lemma("which")) + + movie + Lemma("do") + Actor() + acted_on + Question(Pos("."))) | \ + (Question(Pos("IN")) + Lemma("which") + movie + Lemma("do") + + Actor() + acted_on) | \ + (Question(Lemma("list")) + movie + Lemma("star") + Actor()) + + def interpret(self, match): + performance = IsPerformance() + PerformanceOfActor(match.actor) + movie = IsMovie() + HasPerformance(performance) + movie_name = NameOf(movie) + return movie_name + + +class MovieReleaseDateQuestion(QuestionTemplate): + """ + Ex: "When was The Red Thin Line released?" + "Release date of The Empire Strikes Back" + """ + + regex = ((Lemmas("when be") + Movie() + Lemma("release")) | + (Lemma("release") + Question(Lemma("date")) + + Pos("IN") + Movie())) + \ + Question(Pos(".")) + + def interpret(self, match): + release_date = ReleaseDateOf(match.movie) + return release_date + + +class DirectorOfQuestion(QuestionTemplate): + """ + Ex: "Who is the director of Big Fish?" + "who directed Pocahontas?" 
+ """ + + regex = ((Lemmas("who be") + Pos("DT") + Lemma("director") + + Pos("IN") + Movie()) | + (Lemma("who") + Lemma("direct") + Movie())) + \ + Question(Pos(".")) + + def interpret(self, match): + director = IsPerson() + DirectorOf(match.movie) + director_name = NameOf(director) + return director_name + + +class ActorsOfQuestion(QuestionTemplate): + """ + Ex: "who are the actors of Titanic?" + "who acted in Alien?" + "who starred in Depredator?" + "Actors of Fight Club" + """ + + regex = (Lemma("who") + Question(Lemma("be") + Pos("DT")) + + (Lemma("act") | Lemma("actor") | Lemma("star")) + + Pos("IN") + Movie() + Question(Pos("."))) | \ + ((Lemma("actors") | Lemma("actor")) + Pos("IN") + Movie()) + + def interpret(self, match): + performance = IsPerformance() + PerformanceOfMovie(match.movie) + actor = IsActor() + PerformsIn(performance) + name = NameOf(actor) + return name + + +class PlotOfQuestion(QuestionTemplate): + """ + Ex: "what is Shame about?" + "plot of Titanic" + """ + + regex = ((Lemmas("what be") + Movie() + Lemma("about")) | \ + (Question(Lemmas("what be the")) + Lemma("plot") + + Pos("IN") + Movie())) + \ + Question(Pos(".")) + + def interpret(self, match): + definition = DefinitionOf(match.movie) + return definition diff --git a/examples/freebase/freebase/music.py b/examples/freebase/freebase/music.py index f38d01a..29df091 100644 --- a/examples/freebase/freebase/music.py +++ b/examples/freebase/freebase/music.py @@ -11,7 +11,7 @@ Music related regex """ -from dsl import * +from .dsl import * from refo import Plus, Question from quepy.dsl import HasKeyword from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle diff --git a/examples/freebase/freebase/music.py.bak b/examples/freebase/freebase/music.py.bak new file mode 100644 index 0000000..f38d01a --- /dev/null +++ b/examples/freebase/freebase/music.py.bak @@ -0,0 +1,97 @@ +# coding: utf-8 + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. +# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +""" +Music related regex +""" + +from dsl import * +from refo import Plus, Question +from quepy.dsl import HasKeyword +from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle + + +class Band(Particle): + regex = Question(Pos("DT")) + Plus(Pos("NN") | Pos("NNP")) + + def interpret(self, match): + name = match.words.tokens.title() + return IsBand() + HasKeyword(name) + + +class BandMembersQuestion(QuestionTemplate): + """ + Regex for questions about band member. + Ex: "Radiohead members" + "What are the members of Metallica?" + """ + + regex1 = Band() + Lemma("member") + regex2 = Lemma("member") + Pos("IN") + Band() + regex3 = Pos("WP") + Lemma("be") + Pos("DT") + Lemma("member") + \ + Pos("IN") + Band() + + regex = (regex1 | regex2 | regex3) + Question(Pos(".")) + + def interpret(self, match): + group = GroupOf(match.band) + member = IsPerson() + IsMusicArtist() + IsMemberOf(group) + name = NameOf(member) + return name + + +class FoundationQuestion(QuestionTemplate): + """ + Regex for questions about the creation of a band. + Ex: "When was Pink Floyd founded?" + "When was Korn formed?" 
+ """ + + regex = Pos("WRB") + Lemma("be") + Band() + \ + (Lemma("form") | Lemma("found")) + Question(Pos(".")) + + def interpret(self, match): + active_years = ActiveYearsOf(match.band) + return active_years + + +class GenreQuestion(QuestionTemplate): + """ + Regex for questions about the genre of a band. + Ex: "What is the music genre of Gorillaz?" + "Music genre of Radiohead" + """ + + optional_opening = Question(Pos("WP") + Lemma("be") + Pos("DT")) + regex = optional_opening + Question(Lemma("music")) + Lemma("genre") + \ + Pos("IN") + Band() + Question(Pos(".")) + + def interpret(self, match): + genre = MusicGenreOf(match.band) + name = NameOf(genre) + return name + + +class AlbumsOfQuestion(QuestionTemplate): + """ + Ex: "List albums of Pink Floyd" + "What albums did Pearl Jam record?" + "Albums by Metallica" + """ + + regex = (Question(Lemma("list")) + (Lemma("album") | Lemma("albums")) + \ + Pos("IN") + Band()) | \ + (Lemmas("what album do") + Band() + + (Lemma("record") | Lemma("make")) + Question(Pos("."))) | \ + (Lemma("list") + Band() + Lemma("album")) + + def interpret(self, match): + album = IsAlbum() + ProducedBy(match.band) + name = NameOf(album) + return name diff --git a/examples/freebase/freebase/people.py b/examples/freebase/freebase/people.py index 045e43a..cf6137f 100644 --- a/examples/freebase/freebase/people.py +++ b/examples/freebase/freebase/people.py @@ -11,7 +11,7 @@ People related regex """ -from dsl import * +from .dsl import * from refo import Plus, Question from quepy.dsl import HasKeyword from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle diff --git a/examples/freebase/freebase/people.py.bak b/examples/freebase/freebase/people.py.bak new file mode 100644 index 0000000..045e43a --- /dev/null +++ b/examples/freebase/freebase/people.py.bak @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. +# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +""" +People related regex +""" + +from dsl import * +from refo import Plus, Question +from quepy.dsl import HasKeyword +from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle + + +class Person(Particle): + regex = Plus(Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS")) + + def interpret(self, match): + name = match.words.tokens + return IsPerson() + HasKeyword(name) + + +class WhoIs(QuestionTemplate): + """ + Ex: "Who is Tom Cruise?" + """ + + regex = Lemma("who") + Lemma("be") + Person() + \ + Question(Pos(".")) + + def interpret(self, match): + definition = DefinitionOf(match.person) + return definition + + +class HowOldIsQuestion(QuestionTemplate): + """ + Ex: "How old is Bob Dylan". + """ + + regex = Pos("WRB") + Lemma("old") + Lemma("be") + Person() + \ + Question(Pos(".")) + + def interpret(self, match): + birth_date = BirthDateOf(match.person) + return birth_date + + +class WhereIsFromQuestion(QuestionTemplate): + """ + Ex: "Where is Bill Gates from?" 
+ """ + + regex = Lemmas("where be") + Person() + Lemma("from") + \ + Question(Pos(".")) + + def interpret(self, match): + birth_place = BirthPlaceOf(match.person) + name = NameOf(birth_place) + return name diff --git a/examples/freebase/freebase/tvshows.py b/examples/freebase/freebase/tvshows.py index 1f7f096..879989f 100644 --- a/examples/freebase/freebase/tvshows.py +++ b/examples/freebase/freebase/tvshows.py @@ -4,7 +4,7 @@ Tv Shows related regex. """ -from dsl import * +from .dsl import * from refo import Plus, Question from quepy.dsl import HasKeyword from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle diff --git a/examples/freebase/freebase/tvshows.py.bak b/examples/freebase/freebase/tvshows.py.bak new file mode 100644 index 0000000..1f7f096 --- /dev/null +++ b/examples/freebase/freebase/tvshows.py.bak @@ -0,0 +1,112 @@ +# coding: utf-8 + +""" +Tv Shows related regex. +""" + +from dsl import * +from refo import Plus, Question +from quepy.dsl import HasKeyword +from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle + +nouns = Plus(Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS")) + + +class TvShow(Particle): + regex = Plus(Question(Pos("DT")) + nouns) + + def interpret(self, match): + name = match.words.tokens + return IsTvShow() + HasName(name) + + +class Actor(Particle): + regex = nouns + + def interpret(self, match): + name = match.words.tokens + return IsPerson() + HasName(name) + + +class CastOfQuestion(QuestionTemplate): + """ + Ex: "What is the cast of Friends?" + "Who works in Breaking Bad?" + "List actors of Seinfeld" + """ + + regex = (Question(Lemmas("what be") + Pos("DT")) + + Lemma("cast") + Pos("IN") + TvShow() + Question(Pos("."))) | \ + (Lemmas("who works") + Pos("IN") + TvShow() + + Question(Pos("."))) | \ + (Lemmas("list actor") + Pos("IN") + TvShow()) + + def interpret(self, match): + cast = CastOf(match.tvshow) + actor = IsPerson() + IsActorOf(cast) + name = NameOf(actor) + return name + + +class ListTvShows(QuestionTemplate): + """ + Ex: "List TV shows" + """ + + regex = Lemmas("list tv show") + + def interpret(self, match): + show = IsTvShow() + label = NameOf(show) + return label + + +class EpisodeCountQuestion(QuestionTemplate): + """ + Ex: "How many episodes does Seinfeld have?" + "Number of episodes of Seinfeld" + """ + + regex = ((Lemmas("how many episode do") + TvShow() + Lemma("have")) | + (Lemma("number") + Pos("IN") + Lemma("episode") + + Pos("IN") + TvShow())) + \ + Question(Pos(".")) + + def interpret(self, match): + number_of_episodes = NumberOfEpisodesIn(match.tvshow) + return number_of_episodes + + +class ShowsWithQuestion(QuestionTemplate): + """ + Ex: "List shows with Hugh Laurie" + "In what shows does Jennifer Aniston appears?" + "Shows with Matt LeBlanc" + """ + + regex = (Lemmas("list show") + Pos("IN") + Actor()) | \ + (Pos("IN") + (Lemma("what") | Lemma("which")) + Lemmas("show do") + + Actor() + (Lemma("appear") | Lemma("work")) + + Question(Pos("."))) | \ + ((Lemma("show") | Lemma("shows")) + Pos("IN") + Actor()) + + def interpret(self, match): + cast = HasActor(match.actor) + show = IsTvShow() + HasCast(cast) + show_name = NameOf(show) + return show_name + + +class CreatorOfQuestion(QuestionTemplate): + """ + Ex: "Who is the creator of Breaking Bad?" + "Who are the creators of Friends?" 
+ """ + + regex = Question(Lemmas("who be") + Pos("DT")) + \ + Lemma("creator") + Pos("IN") + TvShow() + Question(Pos(".")) + + def interpret(self, match): + creator = CreatorOf(match.tvshow) + name = NameOf(creator) + return name diff --git a/examples/freebase/freebase/writers.py b/examples/freebase/freebase/writers.py index f66e330..407a4a4 100644 --- a/examples/freebase/freebase/writers.py +++ b/examples/freebase/freebase/writers.py @@ -12,7 +12,7 @@ """ -from dsl import * +from .dsl import * from refo import Plus, Question from quepy.dsl import HasKeyword from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle diff --git a/examples/freebase/freebase/writers.py.bak b/examples/freebase/freebase/writers.py.bak new file mode 100644 index 0000000..f66e330 --- /dev/null +++ b/examples/freebase/freebase/writers.py.bak @@ -0,0 +1,69 @@ +# coding: utf-8 + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. +# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +""" +Writers related regex. +""" + + +from dsl import * +from refo import Plus, Question +from quepy.dsl import HasKeyword +from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle + + +nouns = Pos("DT") | Pos("IN") | Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS") + + +class Book(Particle): + regex = Plus(nouns) + + def interpret(self, match): + name = match.words.tokens + return IsBook() + HasKeyword(name) + + +class Author(Particle): + regex = Plus(nouns | Lemma(".")) + + def interpret(self, match): + name = match.words.tokens + return IsPerson() + HasKeyword(name) + + +class WhoWroteQuestion(QuestionTemplate): + """ + Ex: "who wrote The Little Prince?" + "who is the author of A Game Of Thrones?" + """ + + regex = ((Lemmas("who write") + Book()) | + (Question(Lemmas("who be") + Pos("DT")) + + Lemma("author") + Pos("IN") + Book())) + \ + Question(Pos(".")) + + def interpret(self, match): + author = NameOf(IsPerson() + AuthorOf(match.book)) + return author + + +class BooksByAuthorQuestion(QuestionTemplate): + """ + Ex: "list books by George Orwell" + "which books did Suzanne Collins wrote?" + """ + + regex = (Question(Lemma("list")) + Lemmas("book by") + Author()) | \ + ((Lemma("which") | Lemma("what")) + Lemmas("book do") + + Author() + Lemma("write") + Question(Pos("."))) + + def interpret(self, match): + book = IsBook() + HasAuthor(match.author) + book_name = NameOf(book) + return book_name diff --git a/examples/freebase/main.py b/examples/freebase/main.py index 4d3fe95..0765034 100644 --- a/examples/freebase/main.py +++ b/examples/freebase/main.py @@ -13,7 +13,7 @@ import json import quepy -import urllib +import urllib.request, urllib.parse, urllib.error from docopt import docopt service_url = 'https://www.googleapis.com/freebase/v1/mqlread' @@ -22,8 +22,8 @@ def request(query): params = {'query': query} - url = service_url + '?' + urllib.urlencode(params) - responses = json.loads(urllib.urlopen(url).read()) + url = service_url + '?' 
+ urllib.parse.urlencode(params)
+    responses = json.loads(urllib.request.urlopen(url).read())
     return responses
 
 
@@ -54,14 +54,14 @@ def result_from_responses(responses, target):
     args = docopt(__doc__)
     question = " ".join(args["<question>"])
     target, query, metadata = freebase.get_query(question)
-    print query
+    print(query)
 
     if args["--request"]:
-        print
+        print()
         responses = request(query)
         if "error" in responses:
-            print responses
+            print(responses)
             exit()
         else:
             for response in result_from_responses(responses, target):
-                print response
+                print(response)
diff --git a/examples/freebase/main.py.bak b/examples/freebase/main.py.bak
new file mode 100644
index 0000000..4d3fe95
--- /dev/null
+++ b/examples/freebase/main.py.bak
@@ -0,0 +1,67 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+"""
+Main script for freebase quepy.
+
+Usage:
+    main.py [options] <question> ...
+
+Options:
+    -r --request  Queries the online database and prints the results
+"""
+
+import json
+import quepy
+import urllib
+from docopt import docopt
+
+service_url = 'https://www.googleapis.com/freebase/v1/mqlread'
+freebase = quepy.install("freebase")
+
+
+def request(query):
+    params = {'query': query}
+    url = service_url + '?' + urllib.urlencode(params)
+    responses = json.loads(urllib.urlopen(url).read())
+    return responses
+
+
+def result_from_responses(responses, target):
+    if responses:
+        to_explore = responses["result"]
+        for key in target:
+            _to_explore = []
+            for elem in to_explore:
+                for response in elem[key]:
+                    _to_explore.append(response)
+            to_explore = _to_explore
+
+        result = []
+        for elem in to_explore:
+            if isinstance(elem, dict):
+                if "lang" in elem:
+                    if elem["lang"] == "/lang/en":
+                        result.append(elem.get("value", elem))
+                else:
+                    result.append(elem.get("value", elem))
+            else:
+                result.append(elem)
+        return result
+
+
+if __name__ == "__main__":
+    args = docopt(__doc__)
+    question = " ".join(args["<question>"])
+    target, query, metadata = freebase.get_query(question)
+    print query
+
+    if args["--request"]:
+        print
+        responses = request(query)
+        if "error" in responses:
+            print responses
+            exit()
+        else:
+            for response in result_from_responses(responses, target):
+                print response
diff --git a/quepy/cntagger.py b/quepy/cntagger.py
index 0b42946..9f50366 100644
--- a/quepy/cntagger.py
+++ b/quepy/cntagger.py
@@ -13,7 +13,7 @@
 from quepy.encodingpolicy import assert_valid_encoding
 
 logger = logging.getLogger("quepy.tagger")
-PENN_TAGSET = set(u"$ `` '' ( ) , -- . : CC CD DT EX FW IN JJ JJR JJS LS MD "
+PENN_TAGSET = set("$ `` '' ( ) , -- . : CC CD DT EX FW IN JJ JJR JJS LS MD "
                   "NN NNP NNPS NNS PDT POS PRP PRP$ RB RBR RBS RP SYM TO UH "
                   "VB VBD VBG VBN VBP VBZ WDT WP WP$ WRB".split())
 
@@ -31,8 +31,8 @@ class Word(object):
     Contains *token*, *lemma*, *pos tag* and optionally a *probability* of
     that tag.
     """
-    _encoding_attrs = u"token lemma pos".split()
-    _attrs = _encoding_attrs + [u"prob"]
+    _encoding_attrs = "token lemma pos".split()
+    _attrs = _encoding_attrs + ["prob"]
 
     def __init__(self, token, lemma=None, pos=None, prob=None):
         self.pos = pos
@@ -46,11 +46,11 @@ def __setattr__(self, name, value):
         object.__setattr__(self, name, value)
 
-    def __unicode__(self):
-        attrs = (getattr(self, name, u"-") for name in self._attrs)
-        return u"|".join(str(x) for x in attrs)
+    def __str__(self):
+        attrs = (getattr(self, name, "-") for name in self._attrs)
+        return "|".join(str(x) for x in attrs)
 
     def __repr__(self):
-        return unicode(self)
+        return str(self)
 
 
 def get_cntagger():
diff --git a/quepy/cntagger.py.bak b/quepy/cntagger.py.bak
new file mode 100644
index 0000000..0b42946
--- /dev/null
+++ b/quepy/cntagger.py.bak
@@ -0,0 +1,74 @@
+# coding: utf-8
+
+# Copyright (c) 2012, Machinalis S.R.L.
+# This file is part of quepy and is distributed under the Modified BSD License.
+# You should have received a copy of license in the LICENSE file.
+#
+# Authors: Rafael Carrascosa
+# Gonzalo Garcia Berrotaran
+
+import logging
+
+from quepy import settings
+from quepy.encodingpolicy import assert_valid_encoding
+
+logger = logging.getLogger("quepy.tagger")
+PENN_TAGSET = set(u"$ `` '' ( ) , -- . : CC CD DT EX FW IN JJ JJR JJS LS MD "
+                  "NN NNP NNPS NNS PDT POS PRP PRP$ RB RBR RBS RP SYM TO UH "
+                  "VB VBD VBG VBN VBP VBZ WDT WP WP$ WRB".split())
+
+
+class TaggingError(Exception):
+    """
+    Error parsing tagger's output.
+    """
+    pass
+
+
+class Word(object):
+    """
+    Representation of a tagged word.
+    Contains *token*, *lemma*, *pos tag* and optionally a *probability* of
+    that tag.
+    """
+    _encoding_attrs = u"token lemma pos".split()
+    _attrs = _encoding_attrs + [u"prob"]
+
+    def __init__(self, token, lemma=None, pos=None, prob=None):
+        self.pos = pos
+        self.prob = prob
+        self.lemma = lemma
+        self.token = token
+
+    def __setattr__(self, name, value):
+        if name in self._encoding_attrs and value is not None:
+            assert_valid_encoding(value)
+        object.__setattr__(self, name, value)
+
+    def __unicode__(self):
+        attrs = (getattr(self, name, u"-") for name in self._attrs)
+        return u"|".join(str(x) for x in attrs)
+
+    def __repr__(self):
+        return unicode(self)
+
+
+def get_cntagger():
+    """
+    Return a tagging function given some app settings.
+    `Settings` is the settings module of an app.
+    The returned value is a function that receives a unicode string and returns
+    a list of `Word` instances.
+ """ + from quepy.jiebatagger import run_jiebatagger + tagger_function = lambda x: run_jiebaagger(x) + + def wrapper(string): + assert_valid_encoding(string) + words = tagger_function(string) + for word in words: + if word.pos not in PENN_TAGSET: + logger.warning("Tagger emmited a non-penn " + "POS tag {!r}".format(word.pos)) + return words + return wrapper diff --git a/quepy/dot_generation.py b/quepy/dot_generation.py index e812990..ab728d3 100644 --- a/quepy/dot_generation.py +++ b/quepy/dot_generation.py @@ -11,38 +11,38 @@ def escape(x, add_quotes=True): - x = unicode(x) - x = x.replace(u" ", u"_") - x = x.replace(u"\n", u"") - x = x.replace(u"\00", u"") - x = x.replace(u"[", u"") - x = x.replace(u"]", u"") - x = x.replace(u"\\", u"") + x = str(x) + x = x.replace(" ", "_") + x = x.replace("\n", "") + x = x.replace("\00", "") + x = x.replace("[", "") + x = x.replace("]", "") + x = x.replace("\\", "") if x.count("\""): - x = x.replace(u"\"", u"\\\"") + x = x.replace("\"", "\\\"") if add_quotes: - x = u'"' + x + u'"' + x = '"' + x + '"' return x def adapt(x): if isnode(x): - x = u"x{}".format(x) + x = "x{}".format(x) return x - if isinstance(x, basestring): + if isinstance(x, str): assert_valid_encoding(x) x = escape(x) - if x.startswith(u"\""): + if x.startswith("\""): return x - return u'"{}"'.format(x) - return unicode(x) + return '"{}"'.format(x) + return str(x) def expression_to_dot(e): - d = {u"rdf:type": dot_type, + d = {"rdf:type": dot_type, HasKeyword.relation: dot_keyword, - IsRelatedTo: lambda x, y: dot_arc(x, u"", y)} - s = u"digraph G {{\n{0} [shape=house];\n{1}\n}}\n" + IsRelatedTo: lambda x, y: dot_arc(x, "", y)} + s = "digraph G {{\n{0} [shape=house];\n{1}\n}}\n" xs = [] for node in e.iter_nodes(): for relation, other in e.iter_edges(node): @@ -55,35 +55,35 @@ def expression_to_dot(e): else: x = dot_arc(node1, relation, node2) xs.append(x) - return None, s.format(adapt(e.head), u"".join(xs)) + return None, s.format(adapt(e.head), "".join(xs)) def dot_arc(a, label, b): - assert u" " not in a and u" " not in b - assert u"\n" not in a + label + b - return u"{0} -> {1} [label=\"{2}\"];\n".format(a, b, label) + assert " " not in a and " " not in b + assert "\n" not in a + label + b + return "{0} -> {1} [label=\"{2}\"];\n".format(a, b, label) def dot_type(a, t): - s = u"{0} [shape=box];\n".format(t) - return s + u"{0} -> {1} [color=red, arrowhead=empty];".format(a, t) + s = "{0} [shape=box];\n".format(t) + return s + "{0} -> {1} [color=red, arrowhead=empty];".format(a, t) def dot_attribute(a, key): blank = id(a) - s = u"{0} [shape=none label={1}];\n".format(blank, key) - return s + u"{0} -> {1};".format(a, blank) + s = "{0} [shape=none label={1}];\n".format(blank, key) + return s + "{0} -> {1};".format(a, blank) def dot_keyword(a, key): - blank = u"{0:.30f}".format(random.random()) - blank = u"blank" + blank.replace(u".", u"") - s = u"{0} [shape=none label={1}];\n".format(blank, key) - return s + u"{0} -> {1} [style=dashed];".format(a, blank) + blank = "{0:.30f}".format(random.random()) + blank = "blank" + blank.replace(".", "") + s = "{0} [shape=none label={1}];\n".format(blank, key) + return s + "{0} -> {1} [style=dashed];".format(a, blank) def dot_fixed_type(a, fixedtype): - blank = u"{0:.30f}".format(random.random()) - blank = u"blank" + blank.replace(u".", u"") - s = u"{0} [shape=box label={1}];\n".format(blank, fixedtype) - return s + u"{0} -> {1} [color=red, arrowhead=empty];".format(a, blank) + blank = "{0:.30f}".format(random.random()) + blank = "blank" + 
blank.replace(".", "") + s = "{0} [shape=box label={1}];\n".format(blank, fixedtype) + return s + "{0} -> {1} [color=red, arrowhead=empty];".format(a, blank) diff --git a/quepy/dot_generation.py.bak b/quepy/dot_generation.py.bak new file mode 100644 index 0000000..e812990 --- /dev/null +++ b/quepy/dot_generation.py.bak @@ -0,0 +1,89 @@ +# -*- coding: utf-8 -*- + +""" +Dot generation code. +""" + +import random +from quepy.expression import isnode +from quepy.dsl import IsRelatedTo, HasKeyword +from quepy.encodingpolicy import assert_valid_encoding + + +def escape(x, add_quotes=True): + x = unicode(x) + x = x.replace(u" ", u"_") + x = x.replace(u"\n", u"") + x = x.replace(u"\00", u"") + x = x.replace(u"[", u"") + x = x.replace(u"]", u"") + x = x.replace(u"\\", u"") + if x.count("\""): + x = x.replace(u"\"", u"\\\"") + if add_quotes: + x = u'"' + x + u'"' + return x + + +def adapt(x): + if isnode(x): + x = u"x{}".format(x) + return x + if isinstance(x, basestring): + assert_valid_encoding(x) + x = escape(x) + if x.startswith(u"\""): + return x + return u'"{}"'.format(x) + return unicode(x) + + +def expression_to_dot(e): + d = {u"rdf:type": dot_type, + HasKeyword.relation: dot_keyword, + IsRelatedTo: lambda x, y: dot_arc(x, u"", y)} + s = u"digraph G {{\n{0} [shape=house];\n{1}\n}}\n" + xs = [] + for node in e.iter_nodes(): + for relation, other in e.iter_edges(node): + node1 = adapt(node) + node2 = adapt(other) + relation = escape(relation, add_quotes=False) + + if relation in d: + x = d[relation](node1, node2) + else: + x = dot_arc(node1, relation, node2) + xs.append(x) + return None, s.format(adapt(e.head), u"".join(xs)) + + +def dot_arc(a, label, b): + assert u" " not in a and u" " not in b + assert u"\n" not in a + label + b + return u"{0} -> {1} [label=\"{2}\"];\n".format(a, b, label) + + +def dot_type(a, t): + s = u"{0} [shape=box];\n".format(t) + return s + u"{0} -> {1} [color=red, arrowhead=empty];".format(a, t) + + +def dot_attribute(a, key): + blank = id(a) + s = u"{0} [shape=none label={1}];\n".format(blank, key) + return s + u"{0} -> {1};".format(a, blank) + + +def dot_keyword(a, key): + blank = u"{0:.30f}".format(random.random()) + blank = u"blank" + blank.replace(u".", u"") + s = u"{0} [shape=none label={1}];\n".format(blank, key) + return s + u"{0} -> {1} [style=dashed];".format(a, blank) + + +def dot_fixed_type(a, fixedtype): + blank = u"{0:.30f}".format(random.random()) + blank = u"blank" + blank.replace(u".", u"") + s = u"{0} [shape=box label={1}];\n".format(blank, fixedtype) + return s + u"{0} -> {1} [color=red, arrowhead=empty];".format(a, blank) diff --git a/quepy/dsl.py b/quepy/dsl.py index 74b77f5..fc66b71 100644 --- a/quepy/dsl.py +++ b/quepy/dsl.py @@ -45,7 +45,7 @@ class FixedType(Expression): """ fixedtype = None - fixedtyperelation = u"rdf:type" # FIXME: sparql specific + fixedtyperelation = "rdf:type" # FIXME: sparql specific def __init__(self): super(FixedType, self).__init__() @@ -75,7 +75,7 @@ def __init__(self, data): self.relation = encoding_flexible_conversion(self.relation) if self.language is not None: self.language = encoding_flexible_conversion(self.language) - data = u"\"{0}\"@{1}".format(data, self.language) + data = "\"{0}\"@{1}".format(data, self.language) self.add_data(self.relation, data) @@ -84,7 +84,7 @@ class HasKeyword(FixedDataRelation): Abstraction of an information retrieval key, something standarized used to look up things in the database. 
""" - relation = u"quepy:Keyword" + relation = "quepy:Keyword" def __init__(self, data): data = self.sanitize(data) diff --git a/quepy/dsl.py.bak b/quepy/dsl.py.bak new file mode 100644 index 0000000..74b77f5 --- /dev/null +++ b/quepy/dsl.py.bak @@ -0,0 +1,106 @@ +# coding: utf-8 +# pylint: disable=C0111 + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. +# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +""" +Domain specific language definitions. +""" + +from copy import copy +from quepy.expression import Expression +from quepy.encodingpolicy import encoding_flexible_conversion + + +class FixedRelation(Expression): + """ + Expression for a fixed relation. It states that "A is related to B" + through the relation defined in `relation`. + """ + + relation = None + reverse = False + + def __init__(self, destination, reverse=None): + if reverse is None: + reverse = self.reverse + super(FixedRelation, self).__init__() + if self.relation is None: + raise ValueError("You *must* define the `relation` " + "class attribute to use this class.") + self.nodes = copy(destination.nodes) + self.head = destination.head + self.decapitate(self.relation, reverse) + + +class FixedType(Expression): + """ + Expression for a fixed type. + This captures the idea of something having an specific type. + """ + + fixedtype = None + fixedtyperelation = u"rdf:type" # FIXME: sparql specific + + def __init__(self): + super(FixedType, self).__init__() + if self.fixedtype is None: + raise ValueError("You *must* define the `fixedtype` " + "class attribute to use this class.") + self.fixedtype = encoding_flexible_conversion(self.fixedtype) + self.fixedtyperelation = \ + encoding_flexible_conversion(self.fixedtyperelation) + self.add_data(self.fixedtyperelation, self.fixedtype) + + +class FixedDataRelation(Expression): + """ + Expression for a fixed relation. This is + "A is related to Data" through the relation defined in `relation`. + """ + + relation = None + language = None + + def __init__(self, data): + super(FixedDataRelation, self).__init__() + if self.relation is None: + raise ValueError("You *must* define the `relation` " + "class attribute to use this class.") + self.relation = encoding_flexible_conversion(self.relation) + if self.language is not None: + self.language = encoding_flexible_conversion(self.language) + data = u"\"{0}\"@{1}".format(data, self.language) + self.add_data(self.relation, data) + + +class HasKeyword(FixedDataRelation): + """ + Abstraction of an information retrieval key, something standarized used + to look up things in the database. + """ + relation = u"quepy:Keyword" + + def __init__(self, data): + data = self.sanitize(data) + super(HasKeyword, self).__init__(data) + + @staticmethod + def sanitize(text): + # User can redefine this method if needed + return text + + +class HasType(FixedRelation): + relation = "rdf:type" + + +class IsRelatedTo(FixedRelation): + pass +# Looks weird, yes, here I am using `IsRelatedTo` as a unique identifier. +IsRelatedTo.relation = IsRelatedTo diff --git a/quepy/encodingpolicy.py b/quepy/encodingpolicy.py index a415f59..dcfca31 100644 --- a/quepy/encodingpolicy.py +++ b/quepy/encodingpolicy.py @@ -25,16 +25,16 @@ def encoding_flexible_conversion(string, complain=False): converting a string that had to be on the right encoding. 
""" - if isinstance(string, unicode): + if isinstance(string, str): return string try: ustring = string.decode(settings.DEFAULT_ENCODING) except UnicodeError: - message = u"Argument must be unicode or {}" + message = "Argument must be unicode or {}" raise ValueError(message.format(settings.DEFAULT_ENCODING)) if complain: - logger.warning(u"Forced to guess the encoding of {!r}, please " - u"provide a unicode string instead".format(string)) + logger.warning("Forced to guess the encoding of {!r}, please " + "provide a unicode string instead".format(string)) return ustring @@ -44,5 +44,5 @@ def assert_valid_encoding(string): ValueError exception. """ - if not isinstance(string, unicode): - raise ValueError(u"Argument must be unicode") + if not isinstance(string, str): + raise ValueError("Argument must be unicode") diff --git a/quepy/encodingpolicy.py.bak b/quepy/encodingpolicy.py.bak new file mode 100644 index 0000000..a415f59 --- /dev/null +++ b/quepy/encodingpolicy.py.bak @@ -0,0 +1,48 @@ +# coding: utf-8 + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. +# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +""" +Functions to do encoding checkings. +""" + +import logging +from quepy import settings +logger = logging.getLogger("quepy.encodingpolicy") + + +def encoding_flexible_conversion(string, complain=False): + """ + Converts string to the proper encoding if it's possible + and if it's not raises a ValueError exception. + + If complain it's True, it will emit a logging warning about + converting a string that had to be on the right encoding. + """ + + if isinstance(string, unicode): + return string + try: + ustring = string.decode(settings.DEFAULT_ENCODING) + except UnicodeError: + message = u"Argument must be unicode or {}" + raise ValueError(message.format(settings.DEFAULT_ENCODING)) + if complain: + logger.warning(u"Forced to guess the encoding of {!r}, please " + u"provide a unicode string instead".format(string)) + return ustring + + +def assert_valid_encoding(string): + """ + If string it's not in a valid encoding it raises a + ValueError exception. + """ + + if not isinstance(string, unicode): + raise ValueError(u"Argument must be unicode") diff --git a/quepy/expression.py b/quepy/expression.py index 0f32310..b4dd3fd 100644 --- a/quepy/expression.py +++ b/quepy/expression.py @@ -174,7 +174,7 @@ def iter_nodes(self): """ Iterates the indexes (the unique identifiers) of the Expression nodes. """ - return xrange(len(self.nodes)) + return range(len(self.nodes)) def iter_edges(self, node): """ diff --git a/quepy/expression.py.bak b/quepy/expression.py.bak new file mode 100644 index 0000000..0f32310 --- /dev/null +++ b/quepy/expression.py.bak @@ -0,0 +1,210 @@ +# coding: utf-8 + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. +# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +""" +This file implements the ``Expression`` class. + +``Expression`` is the base class for all the semantic representations in quepy. +It's meant to carry all the information necessary to build a database query in +an abstract form. + +By design it's aimed specifically to represent a SPARQL query, but it should +be able to represent queries in other database languages too. 
+ +A (simple) SPARQL query can be thought as a subgraph that has to match into a +larger graph (the database). Each node of the subgraph is a variable and every +edge a relation. So in order to represent a query, ``Expression`` implements a +this subgraph using adjacency lists. + +Also, ``Expression`` instances are meant to be combined with each other somehow +to make complex queries out of simple ones (this is one of the main objectives +of quepy). + +To do that, every ``Expression`` has a special node called the ``head``, which +is the target node (variable) of the represented query. All operations over +``Expression`` instances work on the ``head`` node, leaving the rest of the +nodes intact. + +So ``Expression`` graphs are not built by explicitly adding nodes and edges +like any other normal graph. Instead they are built by a combination of the +following basic operations: + + - ``__init__``: When a ``Expression`` is instantiated a single solitary + node is created in the graph. + + - ``decapitate``: Creates a blank node and makes it the new ``head`` of the + ``Expression``. Then it adds an edge (a relation) linking + this new head to the old one. So in a single operation a + node and an edge are added. Used to represent stuff like + ``?x rdf:type ?y``. + + - ``add_data``: Adds a relation into some constant data from the ``head`` + node of the ``Expression``. Used to represent stuff like + ``?x rdf:label "John Von Neumann"``. + + - ``merge``: Given two ``Expressions``, it joins their graphs preserving + every node and every edge intact except for their ``head`` + nodes. + The ``head`` nodes are merged into a single node that is the + new ``head`` and shares all the edges of the previous heads. + This is used to combine two graphs like this: + + :: + + A = ?x rdf:type ?y + B = ?x rdf:label "John Von Neumann" + + Into a new one: + + :: + + A + B = ?x rdf:type ?y; + ?x rdf:label "John Von Neumann" + + +You might be saying "Why?! oh gosh why you did it like this?!". +The reasons are: + + - It allows other parts of the code to build queries in a super + intuive language, like ``IsPerson() + HasKeyword("Russell")``. + Go and see the DBpedia example. + + - You can only build connected graphs (ie, no useless variables in query). + + - You cannot have variable name clashes. + + - You cannot build cycles into the graph (could be a con to some, a + plus to other(it's a plus to me)) + + - There are just 3 really basic operations and their semantics are defined + concisely without special cases (if you care for that kind of stuff + (I do)). +""" + + +from collections import defaultdict +from copy import deepcopy + + +def isnode(x): + return isinstance(x, int) + + +class Expression(object): + + def __init__(self): + """ + Creates a new graph with a single solitary blank node. + """ + self.nodes = [] + self.head = self._add_node() + + def _add_node(self): + """ + Adds a blank node to the graph and returns its index (a unique + identifier). + """ + i = len(self.nodes) + self.nodes.append([]) + return i + + def get_head(self): + """ + Returns the index (the unique identifier) of the head node. + """ + return self.head + + def merge(self, other): + """ + Given other Expression, it joins their graphs preserving every + node and every edge intact except for the ``head`` nodes. + The ``head`` nodes are merged into a single node that is the new + ``head`` and shares all the edges of the previous heads. 
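+
+        A hedged sketch (relations and data are illustrative only):
+
+            a = Expression()
+            a.add_data(u"rdf:label", u"Russell")
+            b = Expression()
+            b.decapitate(u"rdf:type")
+            a.merge(b)
+            # a's head now carries both the rdf:label data and b's
+            # rdf:type edge; the node count grew by len(b) - 1.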
+ """ + translation = defaultdict(self._add_node) + translation[other.head] = self.head + for node in other.iter_nodes(): + for relation, dest in other.iter_edges(node): + xs = self.nodes[translation[node]] + if isnode(dest): + dest = translation[dest] + xs.append((relation, dest)) + + def decapitate(self, relation, reverse=False): + """ + Creates a new blank node and makes it the ``head`` of the + Expression. Then it adds an edge (a ``relation``) linking the + the new head to the old one. So in a single operation a + node and an edge are added. + If ``reverse`` is ``True`` then the ``relation`` links the old head to + the new head instead of the opposite (some relations are not + commutative). + """ + oldhead = self.head + self.head = self._add_node() + if reverse: + self.nodes[oldhead].append((relation, self.head)) + else: + self.nodes[self.head].append((relation, oldhead)) + + def add_data(self, relation, value): + """ + Adds a ``relation`` to some constant ``value`` to the ``head`` of the + Expression. + ``value`` is recommended be of type: + - ``unicode`` + - ``str`` and can be decoded using the default encoding (settings.py) + - A custom class that implements a ``__unicode__`` method. + - It can *NEVER* be an ``int``. + + You should not use this to relate nodes in the graph, only to add + data fields to a node. + To relate nodes in a graph use a combination of merge and decapitate. + """ + assert not isnode(value) + self.nodes[self.head].append((relation, value)) + + def iter_nodes(self): + """ + Iterates the indexes (the unique identifiers) of the Expression nodes. + """ + return xrange(len(self.nodes)) + + def iter_edges(self, node): + """ + Iterates over the pairs: ``(relation, index)`` which are the neighbors + of ``node`` in the expression graph, where: + - ``node`` is the index of the node (the unique identifier). + - ``relation`` is the label of the edge between the nodes + - ``index`` is the index of the neighbor (the unique identifier). + """ + return iter(self.nodes[node]) + + def __add__(self, other): + """ + Merges ``self`` and ``other`` in a new Expression instance. + Ie, ``self`` and ``other`` are not modified. + """ + new = deepcopy(self) + new.merge(other) + return new + + def __iadd__(self, other): + """ + Merges ``self`` and ``other`` into ``self`` + ``other`` is not modified but the original data in ``self`` is lost. + """ + self.merge(other) + return self + + def __len__(self): + """ + Amount of nodes in the graph. + """ + return len(self.nodes) diff --git a/quepy/generation.py b/quepy/generation.py index 87b5d9f..28dc6f9 100644 --- a/quepy/generation.py +++ b/quepy/generation.py @@ -34,5 +34,5 @@ def get_code(expression, language): elif language == "mql": return generate_mql(expression) else: - message = u"Language '{}' is not supported" + message = "Language '{}' is not supported" raise ValueError(message.format(language)) diff --git a/quepy/generation.py.bak b/quepy/generation.py.bak new file mode 100644 index 0000000..87b5d9f --- /dev/null +++ b/quepy/generation.py.bak @@ -0,0 +1,38 @@ +# coding: utf-8 + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. +# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +""" +Code generation from an expression to a database language. + +The currently supported languages are: + * MQL + * Sparql + * Dot: generation of graph images mainly for debugging. 
+""" + +from quepy.mql_generation import generate_mql +from quepy.dot_generation import expression_to_dot +from quepy.sparql_generation import expression_to_sparql + + +def get_code(expression, language): + """ + Given an expression and a supported language, it + returns the query for that expression on that language. + """ + + if language == "sparql": + return expression_to_sparql(expression) + elif language == "dot": + return expression_to_dot(expression) + elif language == "mql": + return generate_mql(expression) + else: + message = u"Language '{}' is not supported" + raise ValueError(message.format(language)) diff --git a/quepy/jiebatagger.py b/quepy/jiebatagger.py index a9a238d..95f9526 100644 --- a/quepy/jiebatagger.py +++ b/quepy/jiebatagger.py @@ -25,7 +25,7 @@ def penn_to_morphy_tag(tag): assert_valid_encoding(tag) - for penn, morphy in _penn_to_morphy_tag.iteritems(): + for penn, morphy in _penn_to_morphy_tag.items(): if tag.startswith(penn): return morphy return None @@ -43,10 +43,10 @@ def run_jiebatagger(string): if not _penn_to_morphy_tag: _penn_to_morphy_tag = { - u'NN': wordnet.NOUN, - u'JJ': wordnet.ADJ, - u'VB': wordnet.VERB, - u'RB': wordnet.ADV, + 'NN': wordnet.NOUN, + 'JJ': wordnet.ADJ, + 'VB': wordnet.VERB, + 'RB': wordnet.ADV, } # Recommended tokenizer doesn't handle non-ascii characters very well diff --git a/quepy/jiebatagger.py.bak b/quepy/jiebatagger.py.bak new file mode 100644 index 0000000..a9a238d --- /dev/null +++ b/quepy/jiebatagger.py.bak @@ -0,0 +1,68 @@ +# coding: utf-8 + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. +# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +""" +Tagging using Jieba. +""" + +# Requiered data files are: +# - "averaged_perceptron_tagger" in Models +# - "wordnet" in Corpora + +import jieba +from quepy.tagger import Word +from quepy.encodingpolicy import assert_valid_encoding + +_penn_to_morphy_tag = {} + + +def penn_to_morphy_tag(tag): + assert_valid_encoding(tag) + + for penn, morphy in _penn_to_morphy_tag.iteritems(): + if tag.startswith(penn): + return morphy + return None + + +def run_jiebatagger(string): + """ + Runs jieba tagger on `string` and returns a list of + :class:`quepy.tagger.Word` objects. + """ + assert_valid_encoding(string) + global _penn_to_morphy_tag + + from nltk.corpus import wordnet + + if not _penn_to_morphy_tag: + _penn_to_morphy_tag = { + u'NN': wordnet.NOUN, + u'JJ': wordnet.ADJ, + u'VB': wordnet.VERB, + u'RB': wordnet.ADV, + } + + # Recommended tokenizer doesn't handle non-ascii characters very well + #tokens = jieba.word_tokenize(string) + token_tags = jieba.posseg.cut(string) + + words = [] + for token, pos in token_tags: + word = Word(token) + # Eliminates stuff like JJ|CC + # decode ascii because they are the penn-like POS tags (are ascii). + word.pos = pos.split("|")[0].decode("ascii") + + mtag = penn_to_morphy_tag(word.pos) + word.lemma = None + + words.append(word) + + return words diff --git a/quepy/mql_generation.py b/quepy/mql_generation.py index 97b3bd7..fc39b68 100644 --- a/quepy/mql_generation.py +++ b/quepy/mql_generation.py @@ -25,13 +25,13 @@ def safely_to_unicode(x): Given an "edge" (a relation) or "a data" from an `Expression` graph transform it into a unicode string fitted for insertion into a MQL query. 
""" - if isinstance(x, unicode): + if isinstance(x, str): return x if isinstance(x, str): return encoding_flexible_conversion(x) if isinstance(x, IsRelatedTo): - return u"/type/reflect/any_master" - return unicode(x) # FIXME: Any object is unicode-able, this is error prone + return "/type/reflect/any_master" + return str(x) # FIXME: Any object is unicode-able, this is error prone def to_bidirected_graph(e): @@ -45,7 +45,7 @@ def to_bidirected_graph(e): for relation, other in e.iter_edges(node): relation = safely_to_unicode(relation) if isnode(other): - graph[other].append((u"!" + relation, node)) + graph[other].append(("!" + relation, node)) else: other = safely_to_unicode(other) graph[node].append((relation, other)) diff --git a/quepy/mql_generation.py.bak b/quepy/mql_generation.py.bak new file mode 100644 index 0000000..97b3bd7 --- /dev/null +++ b/quepy/mql_generation.py.bak @@ -0,0 +1,141 @@ +# -*- coding: utf-8 -*- + +import re +import json +from quepy.dsl import IsRelatedTo +from quepy.expression import isnode +from quepy.encodingpolicy import encoding_flexible_conversion + + +def choose_start_node(e): + """ + Choose a node of the `Expression` such that no property leading to a data + has to be reversed (with !). + """ + # Since data "nodes" have no outgoing edges it sufices to find any node + # with an outgoing edge. + for node in e.iter_nodes(): + if list(e.iter_edges(node)): + return node + return node + + +def safely_to_unicode(x): + """ + Given an "edge" (a relation) or "a data" from an `Expression` graph + transform it into a unicode string fitted for insertion into a MQL query. + """ + if isinstance(x, unicode): + return x + if isinstance(x, str): + return encoding_flexible_conversion(x) + if isinstance(x, IsRelatedTo): + return u"/type/reflect/any_master" + return unicode(x) # FIXME: Any object is unicode-able, this is error prone + + +def to_bidirected_graph(e): + """ + Rewrite the graph such that there are reversed edges for every forward + edge. + If an edge goes into a data, it should not be reversed. + """ + graph = {node: [] for node in e.iter_nodes()} + for node in e.iter_nodes(): + for relation, other in e.iter_edges(node): + relation = safely_to_unicode(relation) + if isnode(other): + graph[other].append((u"!" + relation, node)) + else: + other = safely_to_unicode(other) + graph[node].append((relation, other)) + assert all(isnode(x) for x in graph) and len(e) == len(graph) + return graph + + +def post_order_depth_first(graph, start): + """ + Iterate over the nodes of the graph (is a tree) in a way such that every + node is preceded by it's childs. + `graph` is a dict that represents the `Expression` graph. It's a tree too + beacuse Expressions are trees. + `start` is the node to use as the root of the tree. + """ + q = [start] + seen = set() + i = 0 + while i != len(graph): + node = q[i] + seen.add(node) + i += 1 + for _, other in graph[node]: + if isnode(other) and other not in seen: + q.append(other) + assert len(q) == len(graph) + q.reverse() + return q + + +def paths_from_root(graph, start): + """ + Generates paths from `start` to every other node in `graph` and puts it in + the returned dictionary `paths`. + ie.: `paths_from_node(graph, start)[node]` is a list of the edge names used + to get to `node` form `start`. 
+ """ + paths = {start: []} + q = [start] + seen = set() + while q: + node = q.pop() + seen.add(node) + for relation, child in graph[node]: + if isnode(child) and child not in seen: + q.append(child) + paths[child] = paths[node] + [relation] + return paths + + +def generate_mql(e): + """ + Generates a MQL query for the `Expression` `e`. + """ + start = choose_start_node(e) + graph = to_bidirected_graph(e) + generated = {} + for node in post_order_depth_first(graph, start): + d = {} + for relation, other in graph[node]: + if isnode(other): + try: + other = generated[other] + except KeyError: + continue # other is not in post_order_depth_first order + d[relation] = other + generated[node] = [d] + + mql_query = json.dumps(generated[start], sort_keys=True, + indent=2, separators=(',', ': ')) + mql_query = _tidy(mql_query) + target = paths_from_root(graph, start)[e.get_head()] + return target, mql_query + + +def _tidy(mql): + """ + Given a json representing a MQL query it collapses spaces between + braces and curly braces to make it look tidy. + """ + def replacement_function(match): + text = match.group(0) + if text.startswith("[") and text.endswith("]"): + return "[{}]" + elif text.startswith("["): + return "[{" + indent = 0 + match = re.search("}[ \t]*\n(\s*?)\]", text) + if match: + indent = len(match.group(1)) + return " " * indent + "}]" + return re.sub("\[\s*{\s*}\s*\]|\[\s+{|[ \t]*}\s+\]", + replacement_function, mql) diff --git a/quepy/nltktagger.py b/quepy/nltktagger.py index bbd4455..15b2e11 100644 --- a/quepy/nltktagger.py +++ b/quepy/nltktagger.py @@ -25,7 +25,7 @@ def penn_to_morphy_tag(tag): assert_valid_encoding(tag) - for penn, morphy in _penn_to_morphy_tag.iteritems(): + for penn, morphy in _penn_to_morphy_tag.items(): if tag.startswith(penn): return morphy return None @@ -46,10 +46,10 @@ def run_nltktagger(string, nltk_data_path=None): if not _penn_to_morphy_tag: _penn_to_morphy_tag = { - u'NN': wordnet.NOUN, - u'JJ': wordnet.ADJ, - u'VB': wordnet.VERB, - u'RB': wordnet.ADV, + 'NN': wordnet.NOUN, + 'JJ': wordnet.ADJ, + 'VB': wordnet.VERB, + 'RB': wordnet.ADV, } # Recommended tokenizer doesn't handle non-ascii characters very well diff --git a/quepy/nltktagger.py.bak b/quepy/nltktagger.py.bak new file mode 100644 index 0000000..bbd4455 --- /dev/null +++ b/quepy/nltktagger.py.bak @@ -0,0 +1,81 @@ +# coding: utf-8 + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. +# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +""" +Tagging using NLTK. +""" + +# Requiered data files are: +# - "averaged_perceptron_tagger" in Models +# - "wordnet" in Corpora + +import nltk +from quepy.tagger import Word +from quepy.encodingpolicy import assert_valid_encoding + +_penn_to_morphy_tag = {} + + +def penn_to_morphy_tag(tag): + assert_valid_encoding(tag) + + for penn, morphy in _penn_to_morphy_tag.iteritems(): + if tag.startswith(penn): + return morphy + return None + + +def run_nltktagger(string, nltk_data_path=None): + """ + Runs nltk tagger on `string` and returns a list of + :class:`quepy.tagger.Word` objects. 
+ """ + assert_valid_encoding(string) + global _penn_to_morphy_tag + + if nltk_data_path: + nltk.data.path = nltk_data_path + + from nltk.corpus import wordnet + + if not _penn_to_morphy_tag: + _penn_to_morphy_tag = { + u'NN': wordnet.NOUN, + u'JJ': wordnet.ADJ, + u'VB': wordnet.VERB, + u'RB': wordnet.ADV, + } + + # Recommended tokenizer doesn't handle non-ascii characters very well + #tokens = nltk.word_tokenize(string) + tokens = nltk.wordpunct_tokenize(string) + tags = nltk.pos_tag(tokens) + + words = [] + for token, pos in tags: + word = Word(token) + # Eliminates stuff like JJ|CC + # decode ascii because they are the penn-like POS tags (are ascii). + word.pos = pos.split("|")[0].decode("ascii") + + mtag = penn_to_morphy_tag(word.pos) + # Nice shooting, son. What's your name? + lemma = wordnet.morphy(word.token, pos=mtag) + if isinstance(lemma, str): + # In this case lemma is example-based, because if it's rule based + # the result should be unicode (input was unicode). + # Since english is ascii the decoding is ok. + lemma = lemma.decode("ascii") + word.lemma = lemma + if word.lemma is None: + word.lemma = word.token.lower() + + words.append(word) + + return words diff --git a/quepy/quepyapp.py b/quepy/quepyapp.py index e3187d0..415c485 100644 --- a/quepy/quepyapp.py +++ b/quepy/quepyapp.py @@ -30,16 +30,16 @@ def install(app_name): """ module_paths = { - u"settings": u"{0}.settings", - u"parsing": u"{0}", + "settings": "{0}.settings", + "parsing": "{0}", } modules = {} - for module_name, module_path in module_paths.iteritems(): + for module_name, module_path in module_paths.items(): try: modules[module_name] = import_module(module_path.format(app_name)) - except ImportError, error: - message = u"Error importing {0!r}: {1}" + except ImportError as error: + message = "Error importing {0!r}: {1}" raise ImportError(message.format(module_name, error)) return QuepyApp(**modules) @@ -122,10 +122,10 @@ def get_queries(self, question): question = encoding_flexible_conversion(question) for expression, userdata in self._iter_compiled_forms(question): target, query = generation.get_code(expression, self.language) - message = u"Interpretation {1}: {0}" + message = "Interpretation {1}: {0}" logger.debug(message.format(str(expression), expression.rule_used)) - logger.debug(u"Query generated: {0}".format(query)) + logger.debug("Query generated: {0}".format(query)) yield target, query, userdata def _iter_compiled_forms(self, question): @@ -136,12 +136,12 @@ def _iter_compiled_forms(self, question): try: words = list(self.tagger(question)) except TaggingError: - logger.warning(u"Can't parse tagger's output for: '%s'", + logger.warning("Can't parse tagger's output for: '%s'", question) return - logger.debug(u"Tagged question:\n" + - u"\n".join(u"\t{}".format(w for w in words))) + logger.debug("Tagged question:\n" + + "\n".join("\t{}".format(w for w in words))) for rule in self.rules: expression, userdata = rule.get_interpretation(words) diff --git a/quepy/quepyapp.py.bak b/quepy/quepyapp.py.bak new file mode 100644 index 0000000..e3187d0 --- /dev/null +++ b/quepy/quepyapp.py.bak @@ -0,0 +1,162 @@ +# coding: utf-8 + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. 
+# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +""" +Implements the Quepy Application API +""" + +import logging +from importlib import import_module +from types import ModuleType + +from quepy import settings +from quepy import generation +from quepy.parsing import QuestionTemplate +from quepy.tagger import get_tagger, TaggingError +from quepy.encodingpolicy import encoding_flexible_conversion + +logger = logging.getLogger("quepy.quepyapp") + + +def install(app_name): + """ + Installs the application and gives an QuepyApp object + """ + + module_paths = { + u"settings": u"{0}.settings", + u"parsing": u"{0}", + } + modules = {} + + for module_name, module_path in module_paths.iteritems(): + try: + modules[module_name] = import_module(module_path.format(app_name)) + except ImportError, error: + message = u"Error importing {0!r}: {1}" + raise ImportError(message.format(module_name, error)) + + return QuepyApp(**modules) + + +def question_sanitize(question): + question = question.replace("'", "\'") + question = question.replace("\"", "\\\"") + return question + + +class QuepyApp(object): + """ + Provides the quepy application API. + """ + + def __init__(self, parsing, settings): + """ + Creates the application based on `parsing`, `settings` modules. + """ + + assert isinstance(parsing, ModuleType) + assert isinstance(settings, ModuleType) + + self._parsing_module = parsing + self._settings_module = settings + + # Save the settings right after loading settings module + self._save_settings_values() + + self.tagger = get_tagger() + self.language = getattr(self._settings_module, "LANGUAGE", None) + if not self.language: + raise ValueError("Missing configuration for language") + + self.rules = [] + for element in dir(self._parsing_module): + element = getattr(self._parsing_module, element) + + try: + if issubclass(element, QuestionTemplate) and \ + element is not QuestionTemplate: + + self.rules.append(element()) + except TypeError: + continue + + self.rules.sort(key=lambda x: x.weight, reverse=True) + + def get_query(self, question): + """ + Given `question` in natural language, it returns + three things: + + - the target of the query in string format + - the query + - metadata given by the regex programmer (defaults to None) + + The query returned corresponds to the first regex that matches in + weight order. + """ + + question = question_sanitize(question) + for target, query, userdata in self.get_queries(question): + return target, query, userdata + return None, None, None + + def get_queries(self, question): + """ + Given `question` in natural language, it returns + three things: + + - the target of the query in string format + - the query + - metadata given by the regex programmer (defaults to None) + + The queries returned corresponds to the regexes that match in + weight order. + """ + question = encoding_flexible_conversion(question) + for expression, userdata in self._iter_compiled_forms(question): + target, query = generation.get_code(expression, self.language) + message = u"Interpretation {1}: {0}" + logger.debug(message.format(str(expression), + expression.rule_used)) + logger.debug(u"Query generated: {0}".format(query)) + yield target, query, userdata + + def _iter_compiled_forms(self, question): + """ + Returns all the compiled form of the question. 
+ """ + + try: + words = list(self.tagger(question)) + except TaggingError: + logger.warning(u"Can't parse tagger's output for: '%s'", + question) + return + + logger.debug(u"Tagged question:\n" + + u"\n".join(u"\t{}".format(w for w in words))) + + for rule in self.rules: + expression, userdata = rule.get_interpretation(words) + if expression: + yield expression, userdata + + def _save_settings_values(self): + """ + Persists the settings values of the app to the settings module + so it can be accesible from another part of the software. + """ + + for key in dir(self._settings_module): + if key.upper() == key: + value = getattr(self._settings_module, key) + if isinstance(value, str): + value = encoding_flexible_conversion(value) + setattr(settings, key, value) diff --git a/quepy/settings.py b/quepy/settings.py index e69c1ab..b7e3e07 100644 --- a/quepy/settings.py +++ b/quepy/settings.py @@ -21,7 +21,7 @@ DEFAULT_ENCODING = "utf-8" # Sparql config -SPARQL_PREAMBLE = u""" +SPARQL_PREAMBLE = """ PREFIX owl: PREFIX rdfs: PREFIX rdf: diff --git a/quepy/settings.py.bak b/quepy/settings.py.bak new file mode 100644 index 0000000..e69c1ab --- /dev/null +++ b/quepy/settings.py.bak @@ -0,0 +1,31 @@ +# coding: utf-8 + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. +# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +""" +Settings. +""" + +# Generated query language +LANGUAGE = "sparql" + +# NLTK config +NLTK_DATA_PATH = [] # List of paths with NLTK data + +# Encoding config +DEFAULT_ENCODING = "utf-8" + +# Sparql config +SPARQL_PREAMBLE = u""" +PREFIX owl: +PREFIX rdfs: +PREFIX rdf: +PREFIX foaf: +PREFIX skos: +PREFIX quepy: +""" diff --git a/quepy/sparql_generation.py b/quepy/sparql_generation.py index 3b1a218..a7403a6 100644 --- a/quepy/sparql_generation.py +++ b/quepy/sparql_generation.py @@ -9,11 +9,11 @@ from quepy.expression import isnode from quepy.encodingpolicy import assert_valid_encoding -_indent = u" " +_indent = " " def escape(string): - string = unicode(string) + string = str(string) string = string.replace("\n", "") string = string.replace("\r", "") string = string.replace("\t", "") @@ -27,24 +27,24 @@ def escape(string): def adapt(x): if isnode(x): - x = u"?x{}".format(x) + x = "?x{}".format(x) return x - if isinstance(x, basestring): + if isinstance(x, str): assert_valid_encoding(x) - if x.startswith(u"\"") or ":" in x: + if x.startswith("\"") or ":" in x: return x - return u'"{}"'.format(x) - return unicode(x) + return '"{}"'.format(x) + return str(x) def expression_to_sparql(e, full=False): - template = u"{preamble}\n" +\ - u"SELECT DISTINCT {select} WHERE {{\n" +\ - u"{expression}\n" +\ - u"}}\n" + template = "{preamble}\n" +\ + "SELECT DISTINCT {select} WHERE {{\n" +\ + "{expression}\n" +\ + "}}\n" head = adapt(e.get_head()) if full: - select = u"*" + select = "*" else: select = head y = 0 @@ -52,13 +52,13 @@ def expression_to_sparql(e, full=False): for node in e.iter_nodes(): for relation, dest in e.iter_edges(node): if relation is IsRelatedTo: - relation = u"?y{}".format(y) + relation = "?y{}".format(y) y += 1 xs.append(triple(adapt(node), relation, adapt(dest), indentation=1)) sparql = template.format(preamble=settings.SPARQL_PREAMBLE, select=select, - expression=u"\n".join(xs)) + expression="\n".join(xs)) return select, sparql @@ -66,5 +66,5 @@ def triple(a, p, b, indentation=0): a = escape(a) b = escape(b) p = escape(p) - s = 
_indent * indentation + u"{0} {1} {2}." + s = _indent * indentation + "{0} {1} {2}." return s.format(a, p, b) diff --git a/quepy/sparql_generation.py.bak b/quepy/sparql_generation.py.bak new file mode 100644 index 0000000..3b1a218 --- /dev/null +++ b/quepy/sparql_generation.py.bak @@ -0,0 +1,70 @@ +# -*- coding: utf-8 -*- + +""" +Sparql generation code. +""" + +from quepy import settings +from quepy.dsl import IsRelatedTo +from quepy.expression import isnode +from quepy.encodingpolicy import assert_valid_encoding + +_indent = u" " + + +def escape(string): + string = unicode(string) + string = string.replace("\n", "") + string = string.replace("\r", "") + string = string.replace("\t", "") + string = string.replace("\x0b", "") + if not string or any([x for x in string if 0 < ord(x) < 31]) or \ + string.startswith(":") or string.endswith(":"): + message = "Unable to generate sparql: invalid nodes or relation" + raise ValueError(message) + return string + + +def adapt(x): + if isnode(x): + x = u"?x{}".format(x) + return x + if isinstance(x, basestring): + assert_valid_encoding(x) + if x.startswith(u"\"") or ":" in x: + return x + return u'"{}"'.format(x) + return unicode(x) + + +def expression_to_sparql(e, full=False): + template = u"{preamble}\n" +\ + u"SELECT DISTINCT {select} WHERE {{\n" +\ + u"{expression}\n" +\ + u"}}\n" + head = adapt(e.get_head()) + if full: + select = u"*" + else: + select = head + y = 0 + xs = [] + for node in e.iter_nodes(): + for relation, dest in e.iter_edges(node): + if relation is IsRelatedTo: + relation = u"?y{}".format(y) + y += 1 + xs.append(triple(adapt(node), relation, adapt(dest), + indentation=1)) + sparql = template.format(preamble=settings.SPARQL_PREAMBLE, + select=select, + expression=u"\n".join(xs)) + return select, sparql + + +def triple(a, p, b, indentation=0): + a = escape(a) + b = escape(b) + p = escape(p) + s = _indent * indentation + u"{0} {1} {2}." + return s.format(a, p, b) diff --git a/quepy/tagger.py b/quepy/tagger.py index 557e093..ff99758 100644 --- a/quepy/tagger.py +++ b/quepy/tagger.py @@ -13,7 +13,7 @@ from quepy.encodingpolicy import assert_valid_encoding logger = logging.getLogger("quepy.tagger") -PENN_TAGSET = set(u"$ `` '' ( ) , -- . : CC CD DT EX FW IN JJ JJR JJS LS MD " +PENN_TAGSET = set("$ `` '' ( ) , -- . : CC CD DT EX FW IN JJ JJR JJS LS MD " "NN NNP NNPS NNS PDT POS PRP PRP$ RB RBR RBS RP SYM TO UH " "VB VBD VBG VBN VBP VBZ WDT WP WP$ WRB".split()) @@ -31,8 +31,8 @@ class Word(object): Contains *token*, *lemma*, *pos tag* and optionally a *probability* of that tag. """ - _encoding_attrs = u"token lemma pos".split() - _attrs = _encoding_attrs + [u"prob"] + _encoding_attrs = "token lemma pos".split() + _attrs = _encoding_attrs + ["prob"] def __init__(self, token, lemma=None, pos=None, prob=None): self.pos = pos @@ -46,11 +46,11 @@ def __setattr__(self, name, value): object.__setattr__(self, name, value) def __unicode__(self): - attrs = (getattr(self, name, u"-") for name in self._attrs) - return u"|".join(str(x) for x in attrs) + attrs = (getattr(self, name, "-") for name in self._attrs) + return "|".join(str(x) for x in attrs) def __repr__(self): - return unicode(self) + return str(self) def get_tagger(): diff --git a/quepy/tagger.py.bak b/quepy/tagger.py.bak new file mode 100644 index 0000000..557e093 --- /dev/null +++ b/quepy/tagger.py.bak @@ -0,0 +1,74 @@ +# coding: utf-8 + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. 
+# You should have received a copy of license in the LICENSE file. +# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +import logging + +from quepy import settings +from quepy.encodingpolicy import assert_valid_encoding + +logger = logging.getLogger("quepy.tagger") +PENN_TAGSET = set(u"$ `` '' ( ) , -- . : CC CD DT EX FW IN JJ JJR JJS LS MD " + "NN NNP NNPS NNS PDT POS PRP PRP$ RB RBR RBS RP SYM TO UH " + "VB VBD VBG VBN VBP VBZ WDT WP WP$ WRB".split()) + + +class TaggingError(Exception): + """ + Error parsing tagger's output. + """ + pass + + +class Word(object): + """ + Representation of a tagged word. + Contains *token*, *lemma*, *pos tag* and optionally a *probability* of + that tag. + """ + _encoding_attrs = u"token lemma pos".split() + _attrs = _encoding_attrs + [u"prob"] + + def __init__(self, token, lemma=None, pos=None, prob=None): + self.pos = pos + self.prob = prob + self.lemma = lemma + self.token = token + + def __setattr__(self, name, value): + if name in self._encoding_attrs and value is not None: + assert_valid_encoding(value) + object.__setattr__(self, name, value) + + def __unicode__(self): + attrs = (getattr(self, name, u"-") for name in self._attrs) + return u"|".join(str(x) for x in attrs) + + def __repr__(self): + return unicode(self) + + +def get_tagger(): + """ + Return a tagging function given some app settings. + `Settings` is the settings module of an app. + The returned value is a function that receives a unicode string and returns + a list of `Word` instances. + """ + from quepy.nltktagger import run_nltktagger + tagger_function = lambda x: run_nltktagger(x, settings.NLTK_DATA_PATH) + + def wrapper(string): + assert_valid_encoding(string) + words = tagger_function(string) + for word in words: + if word.pos not in PENN_TAGSET: + logger.warning("Tagger emmited a non-penn " + "POS tag {!r}".format(word.pos)) + return words + return wrapper diff --git a/tests/random_expression.py b/tests/random_expression.py index d223a07..520c73a 100644 --- a/tests/random_expression.py +++ b/tests/random_expression.py @@ -9,20 +9,20 @@ def random_data(only_ascii=False): while first or 1 / 20.0 < random.random(): first = False if only_ascii: - c = unichr(random.randint(33, 126)) + c = chr(random.randint(33, 126)) data.append(c) continue x = random.random() if 0.1 > x: - c = random.choice(u" ./\n") + c = random.choice(" ./\n") elif 0.50 > x: - c = unichr(random.randint(65, 122)) + c = chr(random.randint(65, 122)) elif 0.85 > x: - c = unichr(random.randint(0, 127)) + c = chr(random.randint(0, 127)) else: - c = unichr(random.randint(0, 65535)) + c = chr(random.randint(0, 65535)) data.append(c) - return u"".join(data) + return "".join(data) def random_relation(only_ascii=False): diff --git a/tests/random_expression.py.bak b/tests/random_expression.py.bak new file mode 100644 index 0000000..d223a07 --- /dev/null +++ b/tests/random_expression.py.bak @@ -0,0 +1,74 @@ +# -*- coding: utf-8 -*- +import random +from quepy.expression import Expression + + +def random_data(only_ascii=False): + data = [] + first = True + while first or 1 / 20.0 < random.random(): + first = False + if only_ascii: + c = unichr(random.randint(33, 126)) + data.append(c) + continue + x = random.random() + if 0.1 > x: + c = random.choice(u" ./\n") + elif 0.50 > x: + c = unichr(random.randint(65, 122)) + elif 0.85 > x: + c = unichr(random.randint(0, 127)) + else: + c = unichr(random.randint(0, 65535)) + data.append(c) + return u"".join(data) + + +def random_relation(only_ascii=False): + data = 
random_data(only_ascii) + data = data.replace(" ", "") + if random.random() > 0.05: + return data + + class UnicodeableDummy(object): + def __unicode__(self): + return data + return UnicodeableDummy() + + +def random_expression(only_ascii=False): + """ + operations: new node, add data, decapitate, merge + """ + mean_size = 20 + xs = [40.0, 30.0, 50.0, 20.0] + xs = [x * (1.0 - random.random()) for x in xs] + assert all(x != 0 for x in xs) + new_node, add_data, decapitate, _ = [x / sum(xs) for x in xs] + expressions = [Expression(), Expression(), Expression(), Expression()] + while len(expressions) != 1: + if (1.0 / mean_size) < random.random(): + # Will start to merge more and will not create new nodes + new_node = 0.0 + # Choose action + r = random.random() + if r < new_node: + # New expression + expressions.append(Expression()) + elif r < add_data + new_node: + # Add data + e = random.choice(expressions) + e.add_data(random_relation(only_ascii), random_data(only_ascii)) + elif r < decapitate + add_data + new_node: + # Decapitate + e = random.choice(expressions) + e.decapitate(random_relation(only_ascii), + reverse=(0.25 < random.random())) + elif len(expressions) != 1: + # Merge + random.shuffle(expressions) + e2 = expressions.pop() + e1 = expressions[-1] + e1 += e2 + return expressions[0] diff --git a/tests/test_dot_generation.py b/tests/test_dot_generation.py index 2b8d68f..c26db1b 100644 --- a/tests/test_dot_generation.py +++ b/tests/test_dot_generation.py @@ -38,14 +38,14 @@ class X(FixedRelation): class TestDotGeneration(unittest.TestCase): def _standard_check(self, s, e): - self.assertIsInstance(s, unicode) - vs = [u"x{}".format(i) for i in xrange(len(e))] + self.assertIsInstance(s, str) + vs = ["x{}".format(i) for i in range(len(e))] for var in vs: self.assertIn(var, s) def test_dot_takes_unicode(self): - e = gen_fixedtype(u"·̣─@łæßð~¶½") - e += gen_datarel(u"tµŧurułej€", u"←ðßðæßđæßæđßŋŋæ @~~·ŋŋ·¶·ŋ“¶¬@@") + e = gen_fixedtype("·̣─@łæßð~¶½") + e += gen_datarel("tµŧurułej€", "←ðßðæßđæßæđßŋŋæ @~~·ŋŋ·¶·ŋ“¶¬@@") _, s = expression_to_dot(e) self._standard_check(s, e) @@ -66,7 +66,7 @@ def test_dot_stress(self): dot_file = tempfile.NamedTemporaryFile() cmdline = "dot %s" % dot_file.name msg = "dot returned error code {}, check {} input file." - for _ in xrange(100): + for _ in range(100): expression = random_expression() _, dot_string = expression_to_dot(expression) with open(dot_file.name, "w") as filehandler: @@ -76,7 +76,7 @@ def test_dot_stress(self): retcode = subprocess.call(cmdline.split(), stdout=tempfile.TemporaryFile()) except OSError: - print "Warning: the program 'dot' was not found, tests skipped" + print("Warning: the program 'dot' was not found, tests skipped") return if retcode != 0: dot_file.delete = False diff --git a/tests/test_dot_generation.py.bak b/tests/test_dot_generation.py.bak new file mode 100644 index 0000000..2b8d68f --- /dev/null +++ b/tests/test_dot_generation.py.bak @@ -0,0 +1,87 @@ +# -*- coding: utf-8 -*- + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. 
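# Illustration, not part of the patch: the unchanged context lines of
# test_dot_stress above still write bytes into a file opened in text mode,
# which raises TypeError under Python 3. Two idiomatic fixes, sketched with
# a made-up file name:
dot_string = "digraph g { x0 -> x1 }"

# Either open in binary mode and keep the explicit encode()...
with open("graph.dot", "wb") as filehandler:
    filehandler.write(dot_string.encode("utf-8"))

# ...or open in text mode with an explicit encoding and write the str.
with open("graph.dot", "w", encoding="utf-8") as filehandler:
    filehandler.write(dot_string)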
+# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +import unittest +import tempfile +import subprocess +from random_expression import random_expression +from random import seed +from quepy.dot_generation import expression_to_dot +from quepy.dsl import FixedRelation, FixedType, \ + FixedDataRelation + + +def gen_datarel(rel, data): + class X(FixedDataRelation): + relation = rel + return X(data) + + +def gen_fixedtype(type_): + class X(FixedType): + fixedtype = type_ + return X() + + +def gen_fixedrelation(rel, e): + class X(FixedRelation): + relation = rel + return X(e) + + +class TestDotGeneration(unittest.TestCase): + + def _standard_check(self, s, e): + self.assertIsInstance(s, unicode) + vs = [u"x{}".format(i) for i in xrange(len(e))] + for var in vs: + self.assertIn(var, s) + + def test_dot_takes_unicode(self): + e = gen_fixedtype(u"·̣─@łæßð~¶½") + e += gen_datarel(u"tµŧurułej€", u"←ðßðæßđæßæđßŋŋæ @~~·ŋŋ·¶·ŋ“¶¬@@") + _, s = expression_to_dot(e) + self._standard_check(s, e) + + def test_dot_takes_fails_ascii1(self): + e = gen_fixedtype("a") + e += gen_datarel("b", "c") + e = gen_fixedrelation("d", e) + self.assertRaises(ValueError, expression_to_dot, e) + + def test_dot_takes_fails_ascii2(self): + e = gen_fixedtype("·̣─@łæßð~¶½") + e += gen_datarel("tµŧurułej€", "←ðßðæßđæßæđßŋŋæ @~~·ŋŋ·¶·ŋ“¶¬@@") + self.assertRaises(ValueError, expression_to_dot, e) + + def test_dot_stress(self): + seed("I have come here to chew bubblegum and kick ass... " + "and I'm all out of bubblegum.") + dot_file = tempfile.NamedTemporaryFile() + cmdline = "dot %s" % dot_file.name + msg = "dot returned error code {}, check {} input file." + for _ in xrange(100): + expression = random_expression() + _, dot_string = expression_to_dot(expression) + with open(dot_file.name, "w") as filehandler: + filehandler.write(dot_string.encode("utf-8")) + + try: + retcode = subprocess.call(cmdline.split(), + stdout=tempfile.TemporaryFile()) + except OSError: + print "Warning: the program 'dot' was not found, tests skipped" + return + if retcode != 0: + dot_file.delete = False + self.assertEqual(retcode, 0, msg.format(retcode, dot_file.name)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_dsl.py b/tests/test_dsl.py index 792fc91..271f580 100644 --- a/tests/test_dsl.py +++ b/tests/test_dsl.py @@ -17,7 +17,7 @@ class TestDSL(unittest.TestCase): def test_fixed_relation(self): class MyFixedRelation(FixedRelation): - relation = u"uranium:blowtorch" + relation = "uranium:blowtorch" empty = Expression() fixedinstance = MyFixedRelation(empty) @@ -25,13 +25,13 @@ class MyFixedRelation(FixedRelation): head = fixedinstance.get_head() relations = [x[0] for x in fixedinstance.iter_edges(head)] - self.assertIn(u"uranium:blowtorch", relations) + self.assertIn("uranium:blowtorch", relations) def test_fixed_type(self): class MyFixedType(FixedType): - fixedtype = u"uranium:blowtorch" - fixedtyperelation = u"rdf:type" + fixedtype = "uranium:blowtorch" + fixedtyperelation = "rdf:type" fixedinstance = MyFixedType() @@ -39,38 +39,38 @@ class MyFixedType(FixedType): edges = list(fixedinstance.iter_edges(head)) self.assertEqual(len(edges), 1) - self.assertIsInstance(edges[0][0], unicode) - self.assertEqual(edges[0][0], u"rdf:type") - self.assertIsInstance(edges[0][1], unicode) - self.assertEqual(edges[0][1], u"uranium:blowtorch") + self.assertIsInstance(edges[0][0], str) + self.assertEqual(edges[0][0], "rdf:type") + self.assertIsInstance(edges[0][1], str) + self.assertEqual(edges[0][1], "uranium:blowtorch") def 
test_fixed_data_relation(self): class MyFixedDataRelation(FixedDataRelation): - relation = u"uranium:blowtorch" + relation = "uranium:blowtorch" - fixedinstance = MyFixedDataRelation(u"soplete") + fixedinstance = MyFixedDataRelation("soplete") head = fixedinstance.get_head() edges = list(fixedinstance.iter_edges(head)) self.assertEqual(len(edges), 1) - self.assertIsInstance(edges[0][0], unicode) - self.assertEqual(edges[0][0], u"uranium:blowtorch") - self.assertIsInstance(edges[0][1], unicode) - self.assertEqual(edges[0][1], u"soplete") + self.assertIsInstance(edges[0][0], str) + self.assertEqual(edges[0][0], "uranium:blowtorch") + self.assertIsInstance(edges[0][1], str) + self.assertEqual(edges[0][1], "soplete") def test_has_keyword(self): - HasKeyword.relation = u"uranium:keyword" - keywordinstance = HasKeyword(u"soplete") + HasKeyword.relation = "uranium:keyword" + keywordinstance = HasKeyword("soplete") head = keywordinstance.get_head() edges = list(keywordinstance.iter_edges(head)) self.assertEqual(len(edges), 1) - self.assertIsInstance(edges[0][0], unicode) - self.assertEqual(edges[0][0], u"uranium:keyword") - self.assertIsInstance(edges[0][1], unicode) - self.assertEqual(edges[0][1], u'soplete') + self.assertIsInstance(edges[0][0], str) + self.assertEqual(edges[0][0], "uranium:keyword") + self.assertIsInstance(edges[0][1], str) + self.assertEqual(edges[0][1], 'soplete') # With language HasKeyword.language = "en" @@ -79,18 +79,18 @@ def test_has_keyword(self): head = keywordinstance.get_head() edges = list(keywordinstance.iter_edges(head)) self.assertEqual(len(edges), 1) - self.assertIsInstance(edges[0][1], unicode) - self.assertEqual(edges[0][1], u'"soplete"@en') + self.assertIsInstance(edges[0][1], str) + self.assertEqual(edges[0][1], '"soplete"@en') # With sanitize HasKeyword.sanitize = staticmethod(lambda x: x.upper()) - keywordinstance = HasKeyword(u"soplete") + keywordinstance = HasKeyword("soplete") head = keywordinstance.get_head() edges = list(keywordinstance.iter_edges(head)) self.assertEqual(len(edges), 1) - self.assertIsInstance(edges[0][1], unicode) - self.assertEqual(edges[0][1], u'"SOPLETE"@en') + self.assertIsInstance(edges[0][1], str) + self.assertEqual(edges[0][1], '"SOPLETE"@en') if __name__ == "__main__": diff --git a/tests/test_dsl.py.bak b/tests/test_dsl.py.bak new file mode 100644 index 0000000..792fc91 --- /dev/null +++ b/tests/test_dsl.py.bak @@ -0,0 +1,97 @@ +# coding: utf-8 + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. 
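# Illustration, not part of the patch: a minimal usage sketch of the DSL
# behaviour the test_dsl assertions above pin down. The class attributes
# mirror the ones set in the tests; the literal carries the @en tag:
from quepy.dsl import HasKeyword

HasKeyword.relation = "uranium:keyword"
HasKeyword.language = "en"

kw = HasKeyword("soplete")
head = kw.get_head()
print(list(kw.iter_edges(head)))  # [('uranium:keyword', '"soplete"@en')]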
+# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +import unittest +from quepy.expression import Expression +from quepy.dsl import HasKeyword, FixedRelation, FixedType, \ + FixedDataRelation + + +class TestDSL(unittest.TestCase): + def test_fixed_relation(self): + + class MyFixedRelation(FixedRelation): + relation = u"uranium:blowtorch" + + empty = Expression() + fixedinstance = MyFixedRelation(empty) + + head = fixedinstance.get_head() + relations = [x[0] for x in fixedinstance.iter_edges(head)] + + self.assertIn(u"uranium:blowtorch", relations) + + def test_fixed_type(self): + + class MyFixedType(FixedType): + fixedtype = u"uranium:blowtorch" + fixedtyperelation = u"rdf:type" + + fixedinstance = MyFixedType() + + head = fixedinstance.get_head() + edges = list(fixedinstance.iter_edges(head)) + + self.assertEqual(len(edges), 1) + self.assertIsInstance(edges[0][0], unicode) + self.assertEqual(edges[0][0], u"rdf:type") + self.assertIsInstance(edges[0][1], unicode) + self.assertEqual(edges[0][1], u"uranium:blowtorch") + + def test_fixed_data_relation(self): + + class MyFixedDataRelation(FixedDataRelation): + relation = u"uranium:blowtorch" + + fixedinstance = MyFixedDataRelation(u"soplete") + head = fixedinstance.get_head() + edges = list(fixedinstance.iter_edges(head)) + + self.assertEqual(len(edges), 1) + self.assertIsInstance(edges[0][0], unicode) + self.assertEqual(edges[0][0], u"uranium:blowtorch") + self.assertIsInstance(edges[0][1], unicode) + self.assertEqual(edges[0][1], u"soplete") + + def test_has_keyword(self): + + HasKeyword.relation = u"uranium:keyword" + keywordinstance = HasKeyword(u"soplete") + + head = keywordinstance.get_head() + edges = list(keywordinstance.iter_edges(head)) + self.assertEqual(len(edges), 1) + self.assertIsInstance(edges[0][0], unicode) + self.assertEqual(edges[0][0], u"uranium:keyword") + self.assertIsInstance(edges[0][1], unicode) + self.assertEqual(edges[0][1], u'soplete') + + # With language + HasKeyword.language = "en" + keywordinstance = HasKeyword("soplete") + + head = keywordinstance.get_head() + edges = list(keywordinstance.iter_edges(head)) + self.assertEqual(len(edges), 1) + self.assertIsInstance(edges[0][1], unicode) + self.assertEqual(edges[0][1], u'"soplete"@en') + + # With sanitize + HasKeyword.sanitize = staticmethod(lambda x: x.upper()) + keywordinstance = HasKeyword(u"soplete") + + head = keywordinstance.get_head() + edges = list(keywordinstance.iter_edges(head)) + self.assertEqual(len(edges), 1) + self.assertIsInstance(edges[0][1], unicode) + self.assertEqual(edges[0][1], u'"SOPLETE"@en') + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_expressions.py b/tests/test_expressions.py index 639d000..7e108d9 100644 --- a/tests/test_expressions.py +++ b/tests/test_expressions.py @@ -57,7 +57,7 @@ def test_non_empty(self): self.assertNotEqual(len(self.e), 0) def test_add_data(self): - rel = u"|@·~½" + rel = "|@·~½" data = "somedata" self.e.add_data(rel, data) xs = list(self.e.iter_edges(self.e.get_head())) @@ -121,7 +121,7 @@ def setUp(self): self.e = Expression() self.e.add_data("key", "1") self.e.add_data("key", "2") - self.e.add_data(u"~·~··@↓", None) + self.e.add_data("~·~··@↓", None) self.e.add_data(None, None) @@ -129,7 +129,7 @@ class TestExpression3(unittest.TestCase, ExpressionTests): def setUp(self): self.e = Expression() self.e.add_data("key", "1") - self.e.decapitate(u"µ") + self.e.decapitate("µ") self.e.add_data("a", "2") self.e.add_data("a", "3") self.e.add_data(None, None) @@ -145,7 +145,7 @@ 
def setUp(self): other.add_data(0, "1") other.add_data(2, "3") other.decapitate("iuju") - for _ in xrange(5): + for _ in range(5): self.e.decapitate("nouu") self.e += other @@ -237,14 +237,14 @@ def setUp(self): other = Expression() other.decapitate("onelevel") self.a = Expression() - for _ in xrange(5): + for _ in range(5): self.a.decapitate("step") self.a += other other = Expression() other.decapitate("onelevel", reverse=True) self.b = Expression() - for _ in xrange(5): + for _ in range(5): self.b.decapitate("step") self.b += other @@ -255,7 +255,7 @@ def setUp(self): other.add_data(0, "data") other.decapitate("onelevel") self.a = Expression() - for _ in xrange(5): + for _ in range(5): self.a.decapitate("step") self.a += other @@ -263,7 +263,7 @@ def setUp(self): other.add_data(0, "data") other.decapitate("onelevel", reverse=True) self.b = Expression() - for _ in xrange(5): + for _ in range(5): self.b.decapitate("step") self.b += other diff --git a/tests/test_expressions.py.bak b/tests/test_expressions.py.bak new file mode 100644 index 0000000..639d000 --- /dev/null +++ b/tests/test_expressions.py.bak @@ -0,0 +1,279 @@ +#!/usr/bin/env python +# coding: utf-8 + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. +# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +""" +Tests for expressions. +""" + +import unittest +from quepy.expression import Expression, isnode + + +def make_canonical_expression(e): + i = 0 + q = [e.get_head()] + seen = set() + while i != len(q): + node = q[i] + i += 1 + assert node not in seen, "Nouuu, expression is cyclic!" + for relation, child in e.iter_edges(node): + if isnode(child): + q.append(child) + q.reverse() + canon = {} + for node in q: + childs = [] + for label, child in e.iter_edges(node): + if isnode(child): + child = canon[child] + childs.append((label, child)) + childs.sort() + canon[node] = tuple(childs) + return canon[e.get_head()] + + +class ExpressionTests(object): + def test_acyclic(self): + head = self.e.get_head() + q = [head] + seen = set() + while q: + current = q.pop() + self.assertNotIn(current, seen) + seen.add(current) + for relation, child in self.e.iter_edges(current): + if isnode(child): + q.append(child) + + def test_non_empty(self): + self.assertNotEqual(len(self.e), 0) + + def test_add_data(self): + rel = u"|@·~½" + data = "somedata" + self.e.add_data(rel, data) + xs = list(self.e.iter_edges(self.e.get_head())) + self.assertIn((rel, data), xs) + + def test_decapitate(self): + oldhead = self.e.get_head() + self.e.decapitate("blabla") + self.assertNotEqual(oldhead, self.e.get_head()) + xs = list(self.e.iter_edges(self.e.get_head())) + self.assertEqual(xs, [("blabla", oldhead)]) + + def test_merges1(self): + oldlen = len(self.e) + oldhead = self.e.get_head() + other = Expression() + other.decapitate("blabla") + self.e.merge(other) + self.assertEqual(self.e.get_head(), oldhead) + self.assertEqual(len(self.e), oldlen + len(other) - 1) + + def test_merges2(self): + other = Expression() + other.decapitate("blabla") + oldlen = len(other) + oldhead = other.get_head() + other.merge(self.e) + self.assertEqual(other.get_head(), oldhead) + self.assertEqual(len(other), oldlen + len(self.e) - 1) + + def test_plus_makes_copy(self): + other = Expression() + other.decapitate("blabla") + a = self.e + other + self.assertFalse(a is other or self.e is other or a is self.e) + + def 
test_plus_is_conmutative(self): + other = Expression() + other.decapitate("blabla") + a = self.e + other + b = other + self.e + self.assertEqual(make_canonical_expression(a), + make_canonical_expression(b)) + + def test_plus_is_conmutative2(self): + other = Expression() + other.decapitate("blabla") + a = self.e + other + self.e + b = other + self.e + self.e + self.assertEqual(make_canonical_expression(a), + make_canonical_expression(b)) + + +class TestExpression1(unittest.TestCase, ExpressionTests): + def setUp(self): + self.e = Expression() + + +class TestExpression2(unittest.TestCase, ExpressionTests): + def setUp(self): + self.e = Expression() + self.e.add_data("key", "1") + self.e.add_data("key", "2") + self.e.add_data(u"~·~··@↓", None) + self.e.add_data(None, None) + + +class TestExpression3(unittest.TestCase, ExpressionTests): + def setUp(self): + self.e = Expression() + self.e.add_data("key", "1") + self.e.decapitate(u"µ") + self.e.add_data("a", "2") + self.e.add_data("a", "3") + self.e.add_data(None, None) + self.e.decapitate(None) + self.e.add_data(None, None) + + +class TestExpression4(unittest.TestCase, ExpressionTests): + def setUp(self): + self.e = Expression() + self.e.add_data(123, "456") + other = Expression() + other.add_data(0, "1") + other.add_data(2, "3") + other.decapitate("iuju") + for _ in xrange(5): + self.e.decapitate("nouu") + self.e += other + + +class CanonEqualTest(object): + def test_are_the_same(self): + a = make_canonical_expression(self.a) + b = make_canonical_expression(self.b) + self.assertEqual(a, b) + + +class CanonNotEqualTest(object): + def test_are_the_same(self): + a = make_canonical_expression(self.a) + b = make_canonical_expression(self.b) + self.assertNotEqual(a, b) + + +class TestCanon1(unittest.TestCase, CanonEqualTest): + def setUp(self): + self.a = Expression() + self.b = Expression() + + +class TestCanon2(unittest.TestCase, CanonEqualTest): + def setUp(self): + self.a = Expression() + self.a.add_data(None, "1") + self.a.add_data(None, "2") + self.b = Expression() + self.b.add_data(None, "2") + self.b.add_data(None, "1") + + +class TestCanon3(unittest.TestCase, CanonEqualTest): + def setUp(self): + A = Expression() + A.add_data("bla", "somedata") + A.decapitate("hier") + B = Expression() + B.add_data("ble", "otherdata") + B.decapitate("hier") + self.a = A + B + self.b = B + A + + +class TestCanon4(unittest.TestCase, CanonEqualTest): + def setUp(self): + A = Expression() + A.add_data("bla", "somedata") + A.decapitate("hier") + B = Expression() + B.add_data("ble", "otherdata") + B.decapitate("hier") + C = A + B + C.decapitate("hier") + C += B + C.decapitate("hier") + self.a = C + A + D = B + A + D.decapitate("hier") + D += B + D.decapitate("hier") + self.b = D + A + + +class TestCanon95(unittest.TestCase, CanonNotEqualTest): + def setUp(self): + self.a = Expression() + self.a.decapitate("onelevel") + + self.b = Expression() + self.b.decapitate("onelevel", reverse=True) + + +class TestCanon96(unittest.TestCase, CanonNotEqualTest): + def setUp(self): + self.a = Expression() + self.a.add_data(0, "data") + self.a.decapitate("onelevel") + + self.b = Expression() + self.b.add_data(0, "data") + self.b.decapitate("onelevel", reverse=True) + + +class TestCanon97(unittest.TestCase, CanonNotEqualTest): + def setUp(self): + other = Expression() + other.decapitate("onelevel") + self.a = Expression() + for _ in xrange(5): + self.a.decapitate("step") + self.a += other + + other = Expression() + other.decapitate("onelevel", reverse=True) + self.b = 
Expression() + for _ in xrange(5): + self.b.decapitate("step") + self.b += other + + +class TestCanon98(unittest.TestCase, CanonNotEqualTest): + def setUp(self): + other = Expression() + other.add_data(0, "data") + other.decapitate("onelevel") + self.a = Expression() + for _ in xrange(5): + self.a.decapitate("step") + self.a += other + + other = Expression() + other.add_data(0, "data") + other.decapitate("onelevel", reverse=True) + self.b = Expression() + for _ in xrange(5): + self.b.decapitate("step") + self.b += other + + +class TestCanon99(unittest.TestCase, CanonNotEqualTest): + def setUp(self): + self.a = Expression() + self.b = Expression() + self.b.decapitate("relation") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_mql_generation.py b/tests/test_mql_generation.py index d437ce1..f0c2e86 100644 --- a/tests/test_mql_generation.py +++ b/tests/test_mql_generation.py @@ -34,16 +34,16 @@ def _valid_mql_query(self, query): if isinstance(x, list): self.assertIsInstance(x[0], dict) self.assertEqual(len(x), 1) - for key, value in x[0].iteritems(): - self.assertIsInstance(key, unicode) + for key, value in x[0].items(): + self.assertIsInstance(key, str) q.append(value) else: - self.assertIsInstance(x, unicode) + self.assertIsInstance(x, str) def _valid_target_for_query(self, target, query): self.assertIsInstance(target, list) for entry in target: - self.assertIsInstance(entry, unicode) + self.assertIsInstance(entry, str) x = self._get_json(query) if x is None: return @@ -58,7 +58,7 @@ def _valid_target_for_query(self, target, query): def test_mql_stress(self): seed("playadito vs amanda... 3 focas") - for _ in xrange(100): + for _ in range(100): expression = random_expression() target, mql = generate_mql(expression) self._valid_mql_query(mql) diff --git a/tests/test_mql_generation.py.bak b/tests/test_mql_generation.py.bak new file mode 100644 index 0000000..d437ce1 --- /dev/null +++ b/tests/test_mql_generation.py.bak @@ -0,0 +1,68 @@ +# -*- coding: utf-8 -*- + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. +# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +import json +from random import seed +import unittest +from random_expression import random_expression +from quepy.mql_generation import generate_mql + + +class TestMqlGeneration(unittest.TestCase): + def _get_json(self, query): + try: + return json.loads(query) + except ValueError as e: + if "Unpaired" in str(e) and "surrogate" in str(e): + # This is a known issue python's json. + return None + + def _valid_mql_query(self, query): + x = self._get_json(query) + if x is None: + return + q = [x] + while q: + x = q.pop() + # Each entry is either a [{...}] or a unicode + if isinstance(x, list): + self.assertIsInstance(x[0], dict) + self.assertEqual(len(x), 1) + for key, value in x[0].iteritems(): + self.assertIsInstance(key, unicode) + q.append(value) + else: + self.assertIsInstance(x, unicode) + + def _valid_target_for_query(self, target, query): + self.assertIsInstance(target, list) + for entry in target: + self.assertIsInstance(entry, unicode) + x = self._get_json(query) + if x is None: + return + target = list(target) + while target: + entry = target.pop(0) + x = x[0][entry] + self.assertIsInstance(x, list) + self.assertEqual(len(x), 1) + self.assertIsInstance(x[0], dict) + #self.assertEqual(len(x[0]), 0) # Too strict? 
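# Illustration, not part of the patch: a hypothetical example of the shapes
# _valid_target_for_query checks above. generate_mql returns a target path
# (a list of edge names; the relation below is invented) and a JSON query
# built from nested one-element [{...}] levels:
import json

mql_query = '[{"/film/film/directed_by": [{}]}]'
target = ["/film/film/directed_by"]

x = json.loads(mql_query)
for entry in target:
    x = x[0][entry]  # descend one [{...}] level per path element
print(x)             # [{}] -- the empty subquery the datastore fills in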
+ + def test_mql_stress(self): + seed("playadito vs amanda... 3 focas") + for _ in xrange(100): + expression = random_expression() + target, mql = generate_mql(expression) + self._valid_mql_query(mql) + self._valid_target_for_query(target, mql) + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_nltktagger.py b/tests/test_nltktagger.py index e45e272..6d90b65 100644 --- a/tests/test_nltktagger.py +++ b/tests/test_nltktagger.py @@ -19,7 +19,7 @@ class TestNLTKTagger(unittest.TestCase): def test_word_output(self): - output = nltktagger.run_nltktagger(u"this is a test case «¢ðßæŋħħ") + output = nltktagger.run_nltktagger("this is a test case «¢ðßæŋħħ") self.assertIsInstance(output, list) for word in output: diff --git a/tests/test_nltktagger.py.bak b/tests/test_nltktagger.py.bak new file mode 100644 index 0000000..e45e272 --- /dev/null +++ b/tests/test_nltktagger.py.bak @@ -0,0 +1,30 @@ +#!/usr/bin/env python +# coding: utf-8 + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. +# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +""" +Tests for nltktagger. +""" + +import unittest +from quepy import nltktagger +from quepy.tagger import Word + + +class TestNLTKTagger(unittest.TestCase): + def test_word_output(self): + output = nltktagger.run_nltktagger(u"this is a test case «¢ðßæŋħħ") + + self.assertIsInstance(output, list) + for word in output: + self.assertIsInstance(word, Word) + + def tests_wrong_input(self): + self.assertRaises(ValueError, nltktagger.run_nltktagger, + "this is not unicode") diff --git a/tests/test_parsing.py b/tests/test_parsing.py index b58742d..0918172 100644 --- a/tests/test_parsing.py +++ b/tests/test_parsing.py @@ -26,13 +26,13 @@ def setUp(self): self.mockrule = Mockrule class SomeRegex(QuestionTemplate): - regex = Lemma(u"hello") + regex = Lemma("hello") def interpret(self, match): return Mockrule class SomeRegexWithData(QuestionTemplate): - regex = Lemma(u"hello") + regex = Lemma("hello") def interpret(self, match): return Mockrule, 42 @@ -41,28 +41,28 @@ def interpret(self, match): self.regex_with_data = SomeRegexWithData() def test_match(self): - words = [Word(u"hi", u"hello")] + words = [Word("hi", "hello")] ir, userdata = self.regexinstance.get_interpretation(words) self.assertTrue(ir is self.mockrule) self.assertEqual(userdata, None) def test_no_match(self): - words = [Word(u"hi", u"hello"), Word(u"girl", u"girl")] + words = [Word("hi", "hello"), Word("girl", "girl")] ir, userdata = self.regexinstance.get_interpretation(words) self.assertEqual(ir, None) self.assertEqual(userdata, None) def test_user_data(self): - words = [Word(u"hi", u"hello")] + words = [Word("hi", "hello")] _, userdata = self.regex_with_data.get_interpretation(words) self.assertEqual(userdata, 42) def test_no_ir(self): class SomeRegex(QuestionTemplate): - regex = Lemma(u"hello") + regex = Lemma("hello") regexinstance = SomeRegex() - words = [Word(u"hi", u"hello")] + words = [Word("hi", "hello")] self.assertRaises(NotImplementedError, regexinstance.get_interpretation, words) @@ -72,7 +72,7 @@ def interpret(self, match): return Mockrule, "YES!" 
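# Illustration, not part of the patch: a minimal sketch of the template API
# these tests drive. A QuestionTemplate's regex matches on Word lemmas, and
# interpret() may return (intermediate_representation, userdata):
from quepy.parsing import QuestionTemplate, Lemma
from quepy.tagger import Word

class GreetingTemplate(QuestionTemplate):
    regex = Lemma("hello")

    def interpret(self, match):
        return "greeting", 42

words = [Word("hi", "hello")]  # token "hi", lemma "hello"
ir, userdata = GreetingTemplate().get_interpretation(words)
print(ir, userdata)            # greeting 42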
regexinstance = SomeRegex() - words = [Word(u"hi", u"hello")] + words = [Word("hi", "hello")] ir, userdata = regexinstance.get_interpretation(words) self.assertTrue(ir is Mockrule) self.assertEqual(userdata, "YES!") @@ -82,7 +82,7 @@ class SomeRegex(QuestionTemplate): def interpret(self, match): return match - words = [Word(u"|@€đ€łł@ð«|µnþ", u"hello"), Word(u"a", u"b", u"c")] + words = [Word("|@€đ€łł@ð«|µnþ", "hello"), Word("a", "b", "c")] match, _ = SomeRegex().get_interpretation(words) self.assertEqual(words, match.words) @@ -90,35 +90,35 @@ def interpret(self, match): class TestParticle(unittest.TestCase): def setUp(self): class Person(Particle): - regex = Lemma(u"Jim") | Lemma(u"Tonny") + regex = Lemma("Jim") | Lemma("Tonny") def interpret(self, match): return match class PersonRegex(QuestionTemplate): - regex = Person() + Lemma(u"be") + Person(u"another") + regex = Person() + Lemma("be") + Person("another") def interpret(self, match): return match class PersonAsset(Person): - regex = Person() + Lemma(u"'s") + Lemma(u"car") + regex = Person() + Lemma("'s") + Lemma("car") class NestedParticleRegex(PersonRegex): - regex = PersonAsset() + Lemma(u"be") + Person(u"another") + regex = PersonAsset() + Lemma("be") + Person("another") self.personregex = PersonRegex() self.nestedregex = NestedParticleRegex() def test_attrs(self): - words = [Word(x, x) for x in u"Jim be Tonny".split()] + words = [Word(x, x) for x in "Jim be Tonny".split()] match, _ = self.personregex.get_interpretation(words) self.assertEqual(match.another.words[0], words[-1]) self.assertEqual(match.person.words[0], words[0]) self.assertRaises(AttributeError, lambda: match.pirulo) def test_nested_particle(self): - words = [Word(x, x) for x in u"Jim 's car be Tonny".split()] + words = [Word(x, x) for x in "Jim 's car be Tonny".split()] match, _ = self.nestedregex.get_interpretation(words) self.assertEqual(match.personasset.words[0], words[0]) self.assertRaises(AttributeError, lambda: match.personasset.another) diff --git a/tests/test_parsing.py.bak b/tests/test_parsing.py.bak new file mode 100644 index 0000000..b58742d --- /dev/null +++ b/tests/test_parsing.py.bak @@ -0,0 +1,128 @@ +#!/usr/bin/env python +# coding: utf-8 + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. +# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +""" +Tests for Regex module. 
+""" + +import unittest +from quepy.parsing import QuestionTemplate, Particle, Lemma +from quepy.tagger import Word + + +class Mockrule(object): + rulename = "Mock" + + +class TestQuestionTemplate(unittest.TestCase): + def setUp(self): + self.mockrule = Mockrule + + class SomeRegex(QuestionTemplate): + regex = Lemma(u"hello") + + def interpret(self, match): + return Mockrule + + class SomeRegexWithData(QuestionTemplate): + regex = Lemma(u"hello") + + def interpret(self, match): + return Mockrule, 42 + + self.regexinstance = SomeRegex() + self.regex_with_data = SomeRegexWithData() + + def test_match(self): + words = [Word(u"hi", u"hello")] + ir, userdata = self.regexinstance.get_interpretation(words) + self.assertTrue(ir is self.mockrule) + self.assertEqual(userdata, None) + + def test_no_match(self): + words = [Word(u"hi", u"hello"), Word(u"girl", u"girl")] + ir, userdata = self.regexinstance.get_interpretation(words) + self.assertEqual(ir, None) + self.assertEqual(userdata, None) + + def test_user_data(self): + words = [Word(u"hi", u"hello")] + _, userdata = self.regex_with_data.get_interpretation(words) + self.assertEqual(userdata, 42) + + def test_no_ir(self): + class SomeRegex(QuestionTemplate): + regex = Lemma(u"hello") + + regexinstance = SomeRegex() + words = [Word(u"hi", u"hello")] + self.assertRaises(NotImplementedError, + regexinstance.get_interpretation, words) + + def test_regex_empty(self): + class SomeRegex(QuestionTemplate): + def interpret(self, match): + return Mockrule, "YES!" + + regexinstance = SomeRegex() + words = [Word(u"hi", u"hello")] + ir, userdata = regexinstance.get_interpretation(words) + self.assertTrue(ir is Mockrule) + self.assertEqual(userdata, "YES!") + + def test_match_words(self): + class SomeRegex(QuestionTemplate): + def interpret(self, match): + return match + + words = [Word(u"|@€đ€łł@ð«|µnþ", u"hello"), Word(u"a", u"b", u"c")] + match, _ = SomeRegex().get_interpretation(words) + self.assertEqual(words, match.words) + + +class TestParticle(unittest.TestCase): + def setUp(self): + class Person(Particle): + regex = Lemma(u"Jim") | Lemma(u"Tonny") + + def interpret(self, match): + return match + + class PersonRegex(QuestionTemplate): + regex = Person() + Lemma(u"be") + Person(u"another") + + def interpret(self, match): + return match + + class PersonAsset(Person): + regex = Person() + Lemma(u"'s") + Lemma(u"car") + + class NestedParticleRegex(PersonRegex): + regex = PersonAsset() + Lemma(u"be") + Person(u"another") + + self.personregex = PersonRegex() + self.nestedregex = NestedParticleRegex() + + def test_attrs(self): + words = [Word(x, x) for x in u"Jim be Tonny".split()] + match, _ = self.personregex.get_interpretation(words) + self.assertEqual(match.another.words[0], words[-1]) + self.assertEqual(match.person.words[0], words[0]) + self.assertRaises(AttributeError, lambda: match.pirulo) + + def test_nested_particle(self): + words = [Word(x, x) for x in u"Jim 's car be Tonny".split()] + match, _ = self.nestedregex.get_interpretation(words) + self.assertEqual(match.personasset.words[0], words[0]) + self.assertRaises(AttributeError, lambda: match.personasset.another) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_quepyapp.py b/tests/test_quepyapp.py index 7beb106..2becf04 100644 --- a/tests/test_quepyapp.py +++ b/tests/test_quepyapp.py @@ -26,8 +26,8 @@ def test_get_query_types(self): question = "What is this?" 
target, query, userdata = self.app.get_query(question) - self.assertIsInstance(target, unicode) - self.assertIsInstance(query, unicode) + self.assertIsInstance(target, str) + self.assertIsInstance(query, str) def test_get_user_data(self): question = "user data" diff --git a/tests/test_quepyapp.py.bak b/tests/test_quepyapp.py.bak new file mode 100644 index 0000000..7beb106 --- /dev/null +++ b/tests/test_quepyapp.py.bak @@ -0,0 +1,48 @@ +#!/usr/bin/env python +# coding: utf-8 + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. +# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +""" +Tests for QuepyApp. +""" + +import unittest + +import quepy + + +class TestQuepyApp(unittest.TestCase): + + def setUp(self): + self.app = quepy.install("testapp") + + def test_get_query_types(self): + question = "What is this?" + target, query, userdata = self.app.get_query(question) + + self.assertIsInstance(target, unicode) + self.assertIsInstance(query, unicode) + + def test_get_user_data(self): + question = "user data" + target, query, userdata = self.app.get_query(question) + self.assertEqual(userdata, "") + + def test_priority(self): + question = "something something" + target, query, userdata = self.app.get_query(question) + self.assertEqual(userdata, 42) + + def test_config_is_saved(self): + from quepy import settings + self.assertIn("testapp", settings.SPARQL_PREAMBLE) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_sparql_generation.py b/tests/test_sparql_generation.py index 836f7f2..e8ca1e5 100644 --- a/tests/test_sparql_generation.py +++ b/tests/test_sparql_generation.py @@ -42,8 +42,8 @@ class TestSparqlGeneration(unittest.TestCase): re.DOTALL) def _standard_check(self, s, e): - self.assertIsInstance(s, unicode) - vs = [u"x{}".format(i) for i in xrange(len(e))] + self.assertIsInstance(s, str) + vs = ["x{}".format(i) for i in range(len(e))] for var in vs: self.assertIn(var, s) @@ -58,8 +58,8 @@ def _sparql_check(self, s): self.assertNotEqual(self._sparql_line.match(line), None, s) def test_sparql_takes_unicode(self): - e = gen_fixedtype(u"·̣─@łæßð~¶½") - e += gen_datarel(u"tµŧurułej€", u"←ðßðæßđæßæđßŋŋæ @~~·ŋŋ·¶·ŋ“¶¬@@") + e = gen_fixedtype("·̣─@łæßð~¶½") + e += gen_datarel("tµŧurułej€", "←ðßðæßđæßæđßŋŋæ @~~·ŋŋ·¶·ŋ“¶¬@@") _, s = expression_to_sparql(e) self._standard_check(s, e) self._sparql_check(s) @@ -67,7 +67,7 @@ def test_sparql_takes_unicode(self): @unittest.skip("should be fixed") def test_sparql_ascii_stress(self): seed("sacala dunga dunga dunga") - for _ in xrange(100): + for _ in range(100): expression = random_expression(only_ascii=True) _, s = expression_to_sparql(expression) self._standard_check(s, expression) @@ -75,7 +75,7 @@ def test_sparql_ascii_stress(self): def test_sparql_stress(self): seed("sacala dunga dunga dunga") - for _ in xrange(100): + for _ in range(100): expression = random_expression() try: _, s = expression_to_sparql(expression) diff --git a/tests/test_sparql_generation.py.bak b/tests/test_sparql_generation.py.bak new file mode 100644 index 0000000..836f7f2 --- /dev/null +++ b/tests/test_sparql_generation.py.bak @@ -0,0 +1,102 @@ +# -*- coding: utf-8 -*- + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. 
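# Illustration, not part of the patch: what the generator under test emits.
# expression_to_sparql returns the SELECT variable and the query text, both
# str after this conversion. Assumes an app's settings were loaded so that
# settings.SPARQL_PREAMBLE is populated; the type name mirrors the tests:
from quepy.dsl import FixedType
from quepy.sparql_generation import expression_to_sparql

class Blowtorch(FixedType):
    fixedtype = "uranium:blowtorch"
    fixedtyperelation = "rdf:type"

select, sparql = expression_to_sparql(Blowtorch())
print(select)  # "?x0"
print(sparql)  # ...SELECT DISTINCT ?x0 WHERE { ?x0 rdf:type uranium:blowtorch. }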
+# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +import re +import unittest +from random_expression import random_expression +from random import seed +from quepy.sparql_generation import expression_to_sparql +from quepy.dsl import FixedRelation, FixedType, \ + FixedDataRelation + + +def gen_datarel(rel, data): + class X(FixedDataRelation): + relation = rel + return X(data) + + +def gen_fixedtype(type_): + class X(FixedType): + fixedtype = type_ + return X() + + +def gen_fixedrelation(rel, e): + class X(FixedRelation): + relation = rel + return X(e) + + +class TestSparqlGeneration(unittest.TestCase): + + _sparql_line = re.compile("\?x\d+ \S+ (?:\?x\d+|\".*\"|\S+?:\S+?)" + "(?:@\w+)?.", re.DOTALL) + _sparql_query_start = re.compile("SELECT DISTINCT .+ WHERE {(.+)}", + re.DOTALL) + + def _standard_check(self, s, e): + self.assertIsInstance(s, unicode) + vs = [u"x{}".format(i) for i in xrange(len(e))] + for var in vs: + self.assertIn(var, s) + + def _sparql_check(self, s): + m = self._sparql_query_start.search(s) + self.assertNotEqual(m, None, "Could not find query start ") + lines = m.group(1).split("\n") + for line in lines: + line = line.strip() + if line: + s = "Line out of format: {!r}\n".format(line) + self.assertNotEqual(self._sparql_line.match(line), None, s) + + def test_sparql_takes_unicode(self): + e = gen_fixedtype(u"·̣─@łæßð~¶½") + e += gen_datarel(u"tµŧurułej€", u"←ðßðæßđæßæđßŋŋæ @~~·ŋŋ·¶·ŋ“¶¬@@") + _, s = expression_to_sparql(e) + self._standard_check(s, e) + self._sparql_check(s) + + @unittest.skip("should be fixed") + def test_sparql_ascii_stress(self): + seed("sacala dunga dunga dunga") + for _ in xrange(100): + expression = random_expression(only_ascii=True) + _, s = expression_to_sparql(expression) + self._standard_check(s, expression) + self._sparql_check(s) + + def test_sparql_stress(self): + seed("sacala dunga dunga dunga") + for _ in xrange(100): + expression = random_expression() + try: + _, s = expression_to_sparql(expression) + except ValueError as error: + if "Unable to generate sparql" in str(error): + continue + + self._standard_check(s, expression) + self._sparql_check(s) + + def test_sparql_takes_fails_ascii1(self): + e = gen_fixedtype("a") + e += gen_datarel("b", "c") + e = gen_fixedrelation("d", e) + self.assertRaises(ValueError, expression_to_sparql, e) + + def test_sparql_takes_fails_ascii2(self): + e = gen_fixedtype("·̣─@łæßð~¶½") + e += gen_datarel("tµŧurułej€", "←ðßðæßđæßæđßŋŋæ @~~·ŋŋ·¶·ŋ“¶¬@@") + self.assertRaises(ValueError, expression_to_sparql, e) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_tagger.py b/tests/test_tagger.py index 39be54a..a339175 100644 --- a/tests/test_tagger.py +++ b/tests/test_tagger.py @@ -19,45 +19,45 @@ class TestTagger(unittest.TestCase): def test_tagset_unicode(self): for tag in tagger.PENN_TAGSET: - self.assertIsInstance(tag, unicode) + self.assertIsInstance(tag, str) def test_word_encoding(self): - word = tagger.Word(token=u"æßđħłłþłłł@æµß", - lemma=u"ŧłþłßæ#¶ŋħ~#~@", - pos=u"øĸŋøħþ€ĸłþ€øæ«»¢") + word = tagger.Word(token="æßđħłłþłłł@æµß", + lemma="ŧłþłßæ#¶ŋħ~#~@", + pos="øĸŋøħþ€ĸłþ€øæ«»¢") - self.assertIsInstance(word.token, unicode) - self.assertEqual(word.token, u"æßđħłłþłłł@æµß") - self.assertIsInstance(word.lemma, unicode) - self.assertEqual(word.lemma, u"ŧłþłßæ#¶ŋħ~#~@") - self.assertIsInstance(word.pos, unicode) - self.assertEqual(word.pos, u"øĸŋøħþ€ĸłþ€øæ«»¢") + self.assertIsInstance(word.token, str) + self.assertEqual(word.token, "æßđħłłþłłł@æµß") + 
self.assertIsInstance(word.lemma, str) + self.assertEqual(word.lemma, "ŧłþłßæ#¶ŋħ~#~@") + self.assertIsInstance(word.pos, str) + self.assertEqual(word.pos, "øĸŋøħþ€ĸłþ€øæ«»¢") def test_word_wrong_encoding(self): # Token not unicode self.assertRaises(ValueError, tagger.Word, "æßđħłłþłłł@æµß", - u"ŧłþłßæ#¶ŋħ~#~@", u"øĸŋøħþ€ĸłþ€øæ«»¢") + "ŧłþłßæ#¶ŋħ~#~@", "øĸŋøħþ€ĸłþ€øæ«»¢") # Lemma not unicode - self.assertRaises(ValueError, tagger.Word, u"æßđħłłþłłł@æµß", - "ŧłþłßæ#¶ŋħ~#~@", u"øĸŋøħþ€ĸłþ€øæ«»¢") + self.assertRaises(ValueError, tagger.Word, "æßđħłłþłłł@æµß", + "ŧłþłßæ#¶ŋħ~#~@", "øĸŋøħþ€ĸłþ€øæ«»¢") # Pos not unicode - self.assertRaises(ValueError, tagger.Word, u"æßđħłłþłłł@æµß", - u"ŧłþłßæ#¶ŋħ~#~@", "øĸŋøħþ€ĸłþ€øæ«»¢") + self.assertRaises(ValueError, tagger.Word, "æßđħłłþłłł@æµß", + "ŧłþłßæ#¶ŋħ~#~@", "øĸŋøħþ€ĸłþ€øæ«»¢") def test_word_attrib_set(self): - word = tagger.Word(u"æßđħłłþłłł@æµß") - word.lemma = u"ŧłþłßæ#¶ŋħ~#~@" - word.pos = u"øĸŋøħþ€ĸłþ€øæ«»¢" + word = tagger.Word("æßđħłłþłłł@æµß") + word.lemma = "ŧłþłßæ#¶ŋħ~#~@" + word.pos = "øĸŋøħþ€ĸłþ€øæ«»¢" - self.assertIsInstance(word.token, unicode) - self.assertEqual(word.token, u"æßđħłłþłłł@æµß") - self.assertIsInstance(word.lemma, unicode) - self.assertEqual(word.lemma, u"ŧłþłßæ#¶ŋħ~#~@") - self.assertIsInstance(word.pos, unicode) - self.assertEqual(word.pos, u"øĸŋøħþ€ĸłþ€øæ«»¢") + self.assertIsInstance(word.token, str) + self.assertEqual(word.token, "æßđħłłþłłł@æµß") + self.assertIsInstance(word.lemma, str) + self.assertEqual(word.lemma, "ŧłþłßæ#¶ŋħ~#~@") + self.assertIsInstance(word.pos, str) + self.assertEqual(word.pos, "øĸŋøħþ€ĸłþ€øæ«»¢") def test_word_wrong_attrib_set(self): - word = tagger.Word(u"æßđħłłþłłł@æµß") + word = tagger.Word("æßđħłłþłłł@æµß") # Token not unicode self.assertRaises(ValueError, setattr, word, "token", "æßđħłłþłłł@æµß") diff --git a/tests/test_tagger.py.bak b/tests/test_tagger.py.bak new file mode 100644 index 0000000..39be54a --- /dev/null +++ b/tests/test_tagger.py.bak @@ -0,0 +1,67 @@ +#!/usr/bin/env python +# coding: utf-8 + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. +# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +""" +Tests for tagger. 
+""" + +import unittest +from quepy import tagger + + +class TestTagger(unittest.TestCase): + def test_tagset_unicode(self): + for tag in tagger.PENN_TAGSET: + self.assertIsInstance(tag, unicode) + + def test_word_encoding(self): + word = tagger.Word(token=u"æßđħłłþłłł@æµß", + lemma=u"ŧłþłßæ#¶ŋħ~#~@", + pos=u"øĸŋøħþ€ĸłþ€øæ«»¢") + + self.assertIsInstance(word.token, unicode) + self.assertEqual(word.token, u"æßđħłłþłłł@æµß") + self.assertIsInstance(word.lemma, unicode) + self.assertEqual(word.lemma, u"ŧłþłßæ#¶ŋħ~#~@") + self.assertIsInstance(word.pos, unicode) + self.assertEqual(word.pos, u"øĸŋøħþ€ĸłþ€øæ«»¢") + + def test_word_wrong_encoding(self): + # Token not unicode + self.assertRaises(ValueError, tagger.Word, "æßđħłłþłłł@æµß", + u"ŧłþłßæ#¶ŋħ~#~@", u"øĸŋøħþ€ĸłþ€øæ«»¢") + # Lemma not unicode + self.assertRaises(ValueError, tagger.Word, u"æßđħłłþłłł@æµß", + "ŧłþłßæ#¶ŋħ~#~@", u"øĸŋøħþ€ĸłþ€øæ«»¢") + # Pos not unicode + self.assertRaises(ValueError, tagger.Word, u"æßđħłłþłłł@æµß", + u"ŧłþłßæ#¶ŋħ~#~@", "øĸŋøħþ€ĸłþ€øæ«»¢") + + def test_word_attrib_set(self): + word = tagger.Word(u"æßđħłłþłłł@æµß") + word.lemma = u"ŧłþłßæ#¶ŋħ~#~@" + word.pos = u"øĸŋøħþ€ĸłþ€øæ«»¢" + + self.assertIsInstance(word.token, unicode) + self.assertEqual(word.token, u"æßđħłłþłłł@æµß") + self.assertIsInstance(word.lemma, unicode) + self.assertEqual(word.lemma, u"ŧłþłßæ#¶ŋħ~#~@") + self.assertIsInstance(word.pos, unicode) + self.assertEqual(word.pos, u"øĸŋøħþ€ĸłþ€øæ«»¢") + + def test_word_wrong_attrib_set(self): + word = tagger.Word(u"æßđħłłþłłł@æµß") + + # Token not unicode + self.assertRaises(ValueError, setattr, word, "token", "æßđħłłþłłł@æµß") + # Lemma not unicode + self.assertRaises(ValueError, setattr, word, "lemma", "ŧłþłßæ#¶ŋħ~#~@") + # Pos not unicode + self.assertRaises(ValueError, setattr, word, "pos", "øĸŋøħþ€ĸłþ€øæ«»¢") diff --git a/tests/testapp/__init__.py b/tests/testapp/__init__.py index 03df946..9cdf0f0 100644 --- a/tests/testapp/__init__.py +++ b/tests/testapp/__init__.py @@ -12,4 +12,4 @@ Init for testapp quepy. """ -from basic import * \ No newline at end of file +from .basic import * \ No newline at end of file diff --git a/tests/testapp/__init__.py.bak b/tests/testapp/__init__.py.bak new file mode 100644 index 0000000..03df946 --- /dev/null +++ b/tests/testapp/__init__.py.bak @@ -0,0 +1,15 @@ +#!/usr/bin/env python +# coding: utf-8 + +# Copyright (c) 2012, Machinalis S.R.L. +# This file is part of quepy and is distributed under the Modified BSD License. +# You should have received a copy of license in the LICENSE file. +# +# Authors: Rafael Carrascosa +# Gonzalo Garcia Berrotaran + +""" +Init for testapp quepy. 
+""" + +from basic import * \ No newline at end of file From 6c30cf393a9102e38b1dec01607eac67daea7506 Mon Sep 17 00:00:00 2001 From: Bowen Fu Date: Tue, 22 Aug 2017 14:05:37 +0800 Subject: [PATCH 03/10] remove str.decode --- quepy/encodingpolicy.py | 5 ----- quepy/jiebatagger.py | 3 +-- quepy/nltktagger.py | 4 +--- scripts/quepy | 4 +--- 4 files changed, 3 insertions(+), 13 deletions(-) diff --git a/quepy/encodingpolicy.py b/quepy/encodingpolicy.py index dcfca31..4328b27 100644 --- a/quepy/encodingpolicy.py +++ b/quepy/encodingpolicy.py @@ -27,11 +27,6 @@ def encoding_flexible_conversion(string, complain=False): if isinstance(string, str): return string - try: - ustring = string.decode(settings.DEFAULT_ENCODING) - except UnicodeError: - message = "Argument must be unicode or {}" - raise ValueError(message.format(settings.DEFAULT_ENCODING)) if complain: logger.warning("Forced to guess the encoding of {!r}, please " "provide a unicode string instead".format(string)) diff --git a/quepy/jiebatagger.py b/quepy/jiebatagger.py index 95f9526..34ab95a 100644 --- a/quepy/jiebatagger.py +++ b/quepy/jiebatagger.py @@ -57,8 +57,7 @@ def run_jiebatagger(string): for token, pos in token_tags: word = Word(token) # Eliminates stuff like JJ|CC - # decode ascii because they are the penn-like POS tags (are ascii). - word.pos = pos.split("|")[0].decode("ascii") + word.pos = pos.split("|")[0] mtag = penn_to_morphy_tag(word.pos) word.lemma = None diff --git a/quepy/nltktagger.py b/quepy/nltktagger.py index 15b2e11..582e58c 100644 --- a/quepy/nltktagger.py +++ b/quepy/nltktagger.py @@ -61,8 +61,7 @@ def run_nltktagger(string, nltk_data_path=None): for token, pos in tags: word = Word(token) # Eliminates stuff like JJ|CC - # decode ascii because they are the penn-like POS tags (are ascii). - word.pos = pos.split("|")[0].decode("ascii") + word.pos = pos.split("|")[0] mtag = penn_to_morphy_tag(word.pos) # Nice shooting, son. What's your name? @@ -71,7 +70,6 @@ def run_nltktagger(string, nltk_data_path=None): # In this case lemma is example-based, because if it's rule based # the result should be unicode (input was unicode). # Since english is ascii the decoding is ok. 
- lemma = lemma.decode("ascii") word.lemma = lemma if word.lemma is None: word.lemma = word.token.lower() diff --git a/scripts/quepy b/scripts/quepy index 6b477c8..eb784b3 100755 --- a/scripts/quepy +++ b/scripts/quepy @@ -62,7 +62,7 @@ class CommandNotFound(Exception): def startapp(name): - san_name = name.decode("ascii", "ignore").lower().replace(" ", "_") + san_name = name.lower().replace(" ", "_") header_template = "# coding: utf-8\n\n" main_template = "import quepy\n" \ "{san_name} = quepy.install(\"{name}\")\n" @@ -125,7 +125,6 @@ def print_version(): def graph_query(app_name, question): - question = question.decode("ascii") # Set the path to the app sys.path.append(os.getcwd()) @@ -178,7 +177,6 @@ def print_tags(app_name, text): (app_name, error) sys.exit(1) - text = text.decode("ascii") tagger = quepy.tagger.get_tagger() tagger_out = tagger(text) From 3c1cf17a1606bf9e772173a57c7ff288ef937240 Mon Sep 17 00:00:00 2001 From: Bowen Fu Date: Tue, 22 Aug 2017 14:09:57 +0800 Subject: [PATCH 04/10] remove bak --- docs/conf.py.bak | 242 ------------------------ quepy/cntagger.py.bak | 74 -------- quepy/dot_generation.py.bak | 89 --------- quepy/dsl.py.bak | 106 ----------- quepy/encodingpolicy.py.bak | 48 ----- quepy/expression.py.bak | 210 --------------------- quepy/generation.py.bak | 38 ---- quepy/jiebatagger.py.bak | 68 ------- quepy/mql_generation.py.bak | 141 -------------- quepy/nltktagger.py.bak | 81 -------- quepy/quepyapp.py.bak | 162 ---------------- quepy/settings.py.bak | 31 ---- quepy/sparql_generation.py.bak | 70 ------- quepy/tagger.py.bak | 74 -------- tests/random_expression.py.bak | 74 -------- tests/test_dot_generation.py.bak | 87 --------- tests/test_dsl.py.bak | 97 ---------- tests/test_expressions.py.bak | 279 ---------------------------- tests/test_mql_generation.py.bak | 68 ------- tests/test_nltktagger.py.bak | 30 --- tests/test_parsing.py.bak | 128 ------------- tests/test_quepyapp.py.bak | 48 ----- tests/test_sparql_generation.py.bak | 102 ---------- tests/test_tagger.py.bak | 67 ------- 24 files changed, 2414 deletions(-) delete mode 100644 docs/conf.py.bak delete mode 100644 quepy/cntagger.py.bak delete mode 100644 quepy/dot_generation.py.bak delete mode 100644 quepy/dsl.py.bak delete mode 100644 quepy/encodingpolicy.py.bak delete mode 100644 quepy/expression.py.bak delete mode 100644 quepy/generation.py.bak delete mode 100644 quepy/jiebatagger.py.bak delete mode 100644 quepy/mql_generation.py.bak delete mode 100644 quepy/nltktagger.py.bak delete mode 100644 quepy/quepyapp.py.bak delete mode 100644 quepy/settings.py.bak delete mode 100644 quepy/sparql_generation.py.bak delete mode 100644 quepy/tagger.py.bak delete mode 100644 tests/random_expression.py.bak delete mode 100644 tests/test_dot_generation.py.bak delete mode 100644 tests/test_dsl.py.bak delete mode 100644 tests/test_expressions.py.bak delete mode 100644 tests/test_mql_generation.py.bak delete mode 100644 tests/test_nltktagger.py.bak delete mode 100644 tests/test_parsing.py.bak delete mode 100644 tests/test_quepyapp.py.bak delete mode 100644 tests/test_sparql_generation.py.bak delete mode 100644 tests/test_tagger.py.bak diff --git a/docs/conf.py.bak b/docs/conf.py.bak deleted file mode 100644 index 10cf615..0000000 --- a/docs/conf.py.bak +++ /dev/null @@ -1,242 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Quepy documentation build configuration file, created by -# sphinx-quickstart on Mon Nov 5 14:12:47 2012. -# -# This file is execfile()d with the current directory set to its containing dir. 
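For context on the `.decode("ascii")` removals in PATCH 03 above: 2to3 does not touch those calls, but Python 3's `str` is already text and has no `decode()` method, so they have to go by hand. A minimal sketch of the failure they would otherwise cause (not part of the patch):

    pos = "JJ|CC"
    print(pos.split("|")[0])    # 'JJ' -- already a text string in Python 3
    try:
        pos.decode("ascii")     # worked on Python 2 str (bytes), gone in 3
    except AttributeError as error:
        print(error)            # 'str' object has no attribute 'decode'
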
-# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. - -import sys, os - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -#sys.path.insert(0, os.path.abspath('.')) - -# -- General configuration ----------------------------------------------------- - -# If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be extensions -# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc'] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# The suffix of source filenames. -source_suffix = '.rst' - -# The encoding of source files. -#source_encoding = 'utf-8-sig' - -# The master toctree document. -master_doc = 'index' - -# General information about the project. -project = u'Quepy' -copyright = u'2012, Machinalis' - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The short X.Y version. -version = '0.1' -# The full version, including alpha/beta/rc tags. -release = '0.1' - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -#language = None - -# There are two options for replacing |today|: either, you set today to some -# non-false value, then it is used: -#today = '' -# Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -exclude_patterns = ['_build'] - -# The reST default role (used for this markup: `text`) to use for all documents. -#default_role = None - -# If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -#add_module_names = True - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. They are ignored by default. -#show_authors = False - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' - -# A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] - - -# -- Options for HTML output --------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -html_theme = 'default' - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -#html_theme_options = {} - -# Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] - -# The name for this set of Sphinx documents. If None, it defaults to -# " v documentation". -#html_title = None - -# A shorter title for the navigation bar. Default is the same as html_title. 
-#html_short_title = None - -# The name of an image file (relative to this directory) to place at the top -# of the sidebar. -#html_logo = None - -# The name of an image file (within the static path) to use as favicon of the -# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 -# pixels large. -#html_favicon = None - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] - -# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, -# using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' - -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. -#html_use_smartypants = True - -# Custom sidebar templates, maps document names to template names. -#html_sidebars = {} - -# Additional templates that should be rendered to pages, maps page names to -# template names. -#html_additional_pages = {} - -# If false, no module index is generated. -#html_domain_indices = True - -# If false, no index is generated. -#html_use_index = True - -# If true, the index is split into individual pages for each letter. -#html_split_index = False - -# If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True - -# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True - -# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True - -# If true, an OpenSearch description file will be output, and all pages will -# contain a tag referring to it. The value of this option must be the -# base URL from which the finished HTML is served. -#html_use_opensearch = '' - -# This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None - -# Output file base name for HTML help builder. -htmlhelp_basename = 'Quepydoc' - - -# -- Options for LaTeX output -------------------------------------------------- - -latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, author, documentclass [howto/manual]). -latex_documents = [ - ('index', 'Quepy.tex', u'Quepy Documentation', - u'Machinalis', 'manual'), -] - -# The name of an image file (relative to this directory) to place at the top of -# the title page. -#latex_logo = None - -# For "manual" documents, if this is true, then toplevel headings are parts, -# not chapters. -#latex_use_parts = False - -# If true, show page references after internal links. -#latex_show_pagerefs = False - -# If true, show URL addresses after external links. -#latex_show_urls = False - -# Documents to append as an appendix to all manuals. -#latex_appendices = [] - -# If false, no module index is generated. -#latex_domain_indices = True - - -# -- Options for manual page output -------------------------------------------- - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [ - ('index', 'quepy', u'Quepy Documentation', - [u'Machinalis'], 1) -] - -# If true, show URL addresses after external links. 
-#man_show_urls = False - - -# -- Options for Texinfo output ------------------------------------------------ - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - ('index', 'Quepy', u'Quepy Documentation', - u'Machinalis', 'Quepy', 'One line description of project.', - 'Miscellaneous'), -] - -# Documents to append as an appendix to all manuals. -#texinfo_appendices = [] - -# If false, no module index is generated. -#texinfo_domain_indices = True - -# How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' diff --git a/quepy/cntagger.py.bak b/quepy/cntagger.py.bak deleted file mode 100644 index 0b42946..0000000 --- a/quepy/cntagger.py.bak +++ /dev/null @@ -1,74 +0,0 @@ -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -import logging - -from quepy import settings -from quepy.encodingpolicy import assert_valid_encoding - -logger = logging.getLogger("quepy.tagger") -PENN_TAGSET = set(u"$ `` '' ( ) , -- . : CC CD DT EX FW IN JJ JJR JJS LS MD " - "NN NNP NNPS NNS PDT POS PRP PRP$ RB RBR RBS RP SYM TO UH " - "VB VBD VBG VBN VBP VBZ WDT WP WP$ WRB".split()) - - -class TaggingError(Exception): - """ - Error parsing tagger's output. - """ - pass - - -class Word(object): - """ - Representation of a tagged word. - Contains *token*, *lemma*, *pos tag* and optionally a *probability* of - that tag. - """ - _encoding_attrs = u"token lemma pos".split() - _attrs = _encoding_attrs + [u"prob"] - - def __init__(self, token, lemma=None, pos=None, prob=None): - self.pos = pos - self.prob = prob - self.lemma = lemma - self.token = token - - def __setattr__(self, name, value): - if name in self._encoding_attrs and value is not None: - assert_valid_encoding(value) - object.__setattr__(self, name, value) - - def __unicode__(self): - attrs = (getattr(self, name, u"-") for name in self._attrs) - return u"|".join(str(x) for x in attrs) - - def __repr__(self): - return unicode(self) - - -def get_cntagger(): - """ - Return a tagging function given some app settings. - `Settings` is the settings module of an app. - The returned value is a function that receives a unicode string and returns - a list of `Word` instances. - """ - from quepy.jiebatagger import run_jiebatagger - tagger_function = lambda x: run_jiebaagger(x) - - def wrapper(string): - assert_valid_encoding(string) - words = tagger_function(string) - for word in words: - if word.pos not in PENN_TAGSET: - logger.warning("Tagger emmited a non-penn " - "POS tag {!r}".format(word.pos)) - return words - return wrapper diff --git a/quepy/dot_generation.py.bak b/quepy/dot_generation.py.bak deleted file mode 100644 index e812990..0000000 --- a/quepy/dot_generation.py.bak +++ /dev/null @@ -1,89 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Dot generation code. 
-""" - -import random -from quepy.expression import isnode -from quepy.dsl import IsRelatedTo, HasKeyword -from quepy.encodingpolicy import assert_valid_encoding - - -def escape(x, add_quotes=True): - x = unicode(x) - x = x.replace(u" ", u"_") - x = x.replace(u"\n", u"") - x = x.replace(u"\00", u"") - x = x.replace(u"[", u"") - x = x.replace(u"]", u"") - x = x.replace(u"\\", u"") - if x.count("\""): - x = x.replace(u"\"", u"\\\"") - if add_quotes: - x = u'"' + x + u'"' - return x - - -def adapt(x): - if isnode(x): - x = u"x{}".format(x) - return x - if isinstance(x, basestring): - assert_valid_encoding(x) - x = escape(x) - if x.startswith(u"\""): - return x - return u'"{}"'.format(x) - return unicode(x) - - -def expression_to_dot(e): - d = {u"rdf:type": dot_type, - HasKeyword.relation: dot_keyword, - IsRelatedTo: lambda x, y: dot_arc(x, u"", y)} - s = u"digraph G {{\n{0} [shape=house];\n{1}\n}}\n" - xs = [] - for node in e.iter_nodes(): - for relation, other in e.iter_edges(node): - node1 = adapt(node) - node2 = adapt(other) - relation = escape(relation, add_quotes=False) - - if relation in d: - x = d[relation](node1, node2) - else: - x = dot_arc(node1, relation, node2) - xs.append(x) - return None, s.format(adapt(e.head), u"".join(xs)) - - -def dot_arc(a, label, b): - assert u" " not in a and u" " not in b - assert u"\n" not in a + label + b - return u"{0} -> {1} [label=\"{2}\"];\n".format(a, b, label) - - -def dot_type(a, t): - s = u"{0} [shape=box];\n".format(t) - return s + u"{0} -> {1} [color=red, arrowhead=empty];".format(a, t) - - -def dot_attribute(a, key): - blank = id(a) - s = u"{0} [shape=none label={1}];\n".format(blank, key) - return s + u"{0} -> {1};".format(a, blank) - - -def dot_keyword(a, key): - blank = u"{0:.30f}".format(random.random()) - blank = u"blank" + blank.replace(u".", u"") - s = u"{0} [shape=none label={1}];\n".format(blank, key) - return s + u"{0} -> {1} [style=dashed];".format(a, blank) - - -def dot_fixed_type(a, fixedtype): - blank = u"{0:.30f}".format(random.random()) - blank = u"blank" + blank.replace(u".", u"") - s = u"{0} [shape=box label={1}];\n".format(blank, fixedtype) - return s + u"{0} -> {1} [color=red, arrowhead=empty];".format(a, blank) diff --git a/quepy/dsl.py.bak b/quepy/dsl.py.bak deleted file mode 100644 index 74b77f5..0000000 --- a/quepy/dsl.py.bak +++ /dev/null @@ -1,106 +0,0 @@ -# coding: utf-8 -# pylint: disable=C0111 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -Domain specific language definitions. -""" - -from copy import copy -from quepy.expression import Expression -from quepy.encodingpolicy import encoding_flexible_conversion - - -class FixedRelation(Expression): - """ - Expression for a fixed relation. It states that "A is related to B" - through the relation defined in `relation`. - """ - - relation = None - reverse = False - - def __init__(self, destination, reverse=None): - if reverse is None: - reverse = self.reverse - super(FixedRelation, self).__init__() - if self.relation is None: - raise ValueError("You *must* define the `relation` " - "class attribute to use this class.") - self.nodes = copy(destination.nodes) - self.head = destination.head - self.decapitate(self.relation, reverse) - - -class FixedType(Expression): - """ - Expression for a fixed type. 
- This captures the idea of something having an specific type. - """ - - fixedtype = None - fixedtyperelation = u"rdf:type" # FIXME: sparql specific - - def __init__(self): - super(FixedType, self).__init__() - if self.fixedtype is None: - raise ValueError("You *must* define the `fixedtype` " - "class attribute to use this class.") - self.fixedtype = encoding_flexible_conversion(self.fixedtype) - self.fixedtyperelation = \ - encoding_flexible_conversion(self.fixedtyperelation) - self.add_data(self.fixedtyperelation, self.fixedtype) - - -class FixedDataRelation(Expression): - """ - Expression for a fixed relation. This is - "A is related to Data" through the relation defined in `relation`. - """ - - relation = None - language = None - - def __init__(self, data): - super(FixedDataRelation, self).__init__() - if self.relation is None: - raise ValueError("You *must* define the `relation` " - "class attribute to use this class.") - self.relation = encoding_flexible_conversion(self.relation) - if self.language is not None: - self.language = encoding_flexible_conversion(self.language) - data = u"\"{0}\"@{1}".format(data, self.language) - self.add_data(self.relation, data) - - -class HasKeyword(FixedDataRelation): - """ - Abstraction of an information retrieval key, something standarized used - to look up things in the database. - """ - relation = u"quepy:Keyword" - - def __init__(self, data): - data = self.sanitize(data) - super(HasKeyword, self).__init__(data) - - @staticmethod - def sanitize(text): - # User can redefine this method if needed - return text - - -class HasType(FixedRelation): - relation = "rdf:type" - - -class IsRelatedTo(FixedRelation): - pass -# Looks weird, yes, here I am using `IsRelatedTo` as a unique identifier. -IsRelatedTo.relation = IsRelatedTo diff --git a/quepy/encodingpolicy.py.bak b/quepy/encodingpolicy.py.bak deleted file mode 100644 index a415f59..0000000 --- a/quepy/encodingpolicy.py.bak +++ /dev/null @@ -1,48 +0,0 @@ -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -Functions to do encoding checkings. -""" - -import logging -from quepy import settings -logger = logging.getLogger("quepy.encodingpolicy") - - -def encoding_flexible_conversion(string, complain=False): - """ - Converts string to the proper encoding if it's possible - and if it's not raises a ValueError exception. - - If complain it's True, it will emit a logging warning about - converting a string that had to be on the right encoding. - """ - - if isinstance(string, unicode): - return string - try: - ustring = string.decode(settings.DEFAULT_ENCODING) - except UnicodeError: - message = u"Argument must be unicode or {}" - raise ValueError(message.format(settings.DEFAULT_ENCODING)) - if complain: - logger.warning(u"Forced to guess the encoding of {!r}, please " - u"provide a unicode string instead".format(string)) - return ustring - - -def assert_valid_encoding(string): - """ - If string it's not in a valid encoding it raises a - ValueError exception. - """ - - if not isinstance(string, unicode): - raise ValueError(u"Argument must be unicode") diff --git a/quepy/expression.py.bak b/quepy/expression.py.bak deleted file mode 100644 index 0f32310..0000000 --- a/quepy/expression.py.bak +++ /dev/null @@ -1,210 +0,0 @@ -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. 
-# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -This file implements the ``Expression`` class. - -``Expression`` is the base class for all the semantic representations in quepy. -It's meant to carry all the information necessary to build a database query in -an abstract form. - -By design it's aimed specifically to represent a SPARQL query, but it should -be able to represent queries in other database languages too. - -A (simple) SPARQL query can be thought as a subgraph that has to match into a -larger graph (the database). Each node of the subgraph is a variable and every -edge a relation. So in order to represent a query, ``Expression`` implements a -this subgraph using adjacency lists. - -Also, ``Expression`` instances are meant to be combined with each other somehow -to make complex queries out of simple ones (this is one of the main objectives -of quepy). - -To do that, every ``Expression`` has a special node called the ``head``, which -is the target node (variable) of the represented query. All operations over -``Expression`` instances work on the ``head`` node, leaving the rest of the -nodes intact. - -So ``Expression`` graphs are not built by explicitly adding nodes and edges -like any other normal graph. Instead they are built by a combination of the -following basic operations: - - - ``__init__``: When a ``Expression`` is instantiated a single solitary - node is created in the graph. - - - ``decapitate``: Creates a blank node and makes it the new ``head`` of the - ``Expression``. Then it adds an edge (a relation) linking - this new head to the old one. So in a single operation a - node and an edge are added. Used to represent stuff like - ``?x rdf:type ?y``. - - - ``add_data``: Adds a relation into some constant data from the ``head`` - node of the ``Expression``. Used to represent stuff like - ``?x rdf:label "John Von Neumann"``. - - - ``merge``: Given two ``Expressions``, it joins their graphs preserving - every node and every edge intact except for their ``head`` - nodes. - The ``head`` nodes are merged into a single node that is the - new ``head`` and shares all the edges of the previous heads. - This is used to combine two graphs like this: - - :: - - A = ?x rdf:type ?y - B = ?x rdf:label "John Von Neumann" - - Into a new one: - - :: - - A + B = ?x rdf:type ?y; - ?x rdf:label "John Von Neumann" - - -You might be saying "Why?! oh gosh why you did it like this?!". -The reasons are: - - - It allows other parts of the code to build queries in a super - intuive language, like ``IsPerson() + HasKeyword("Russell")``. - Go and see the DBpedia example. - - - You can only build connected graphs (ie, no useless variables in query). - - - You cannot have variable name clashes. - - - You cannot build cycles into the graph (could be a con to some, a - plus to other(it's a plus to me)) - - - There are just 3 really basic operations and their semantics are defined - concisely without special cases (if you care for that kind of stuff - (I do)). -""" - - -from collections import defaultdict -from copy import deepcopy - - -def isnode(x): - return isinstance(x, int) - - -class Expression(object): - - def __init__(self): - """ - Creates a new graph with a single solitary blank node. 
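The design notes above survive verbatim in the converted quepy/expression.py. A short worked version of the A + B combination they describe, a sketch assuming only the documented API:

    from quepy.expression import Expression

    a = Expression()
    a.decapitate("rdf:type")                     # A: ?x1 rdf:type ?x0
    b = Expression()
    b.add_data("rdf:label", "John Von Neumann")  # B: ?x0 rdf:label "John..."
    c = a + b    # heads merged: ?x1 rdf:type ?x0 ; ?x1 rdf:label "John..."
    print(len(c))                                # 2 -- no duplicate nodes
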
- """ - self.nodes = [] - self.head = self._add_node() - - def _add_node(self): - """ - Adds a blank node to the graph and returns its index (a unique - identifier). - """ - i = len(self.nodes) - self.nodes.append([]) - return i - - def get_head(self): - """ - Returns the index (the unique identifier) of the head node. - """ - return self.head - - def merge(self, other): - """ - Given other Expression, it joins their graphs preserving every - node and every edge intact except for the ``head`` nodes. - The ``head`` nodes are merged into a single node that is the new - ``head`` and shares all the edges of the previous heads. - """ - translation = defaultdict(self._add_node) - translation[other.head] = self.head - for node in other.iter_nodes(): - for relation, dest in other.iter_edges(node): - xs = self.nodes[translation[node]] - if isnode(dest): - dest = translation[dest] - xs.append((relation, dest)) - - def decapitate(self, relation, reverse=False): - """ - Creates a new blank node and makes it the ``head`` of the - Expression. Then it adds an edge (a ``relation``) linking the - the new head to the old one. So in a single operation a - node and an edge are added. - If ``reverse`` is ``True`` then the ``relation`` links the old head to - the new head instead of the opposite (some relations are not - commutative). - """ - oldhead = self.head - self.head = self._add_node() - if reverse: - self.nodes[oldhead].append((relation, self.head)) - else: - self.nodes[self.head].append((relation, oldhead)) - - def add_data(self, relation, value): - """ - Adds a ``relation`` to some constant ``value`` to the ``head`` of the - Expression. - ``value`` is recommended be of type: - - ``unicode`` - - ``str`` and can be decoded using the default encoding (settings.py) - - A custom class that implements a ``__unicode__`` method. - - It can *NEVER* be an ``int``. - - You should not use this to relate nodes in the graph, only to add - data fields to a node. - To relate nodes in a graph use a combination of merge and decapitate. - """ - assert not isnode(value) - self.nodes[self.head].append((relation, value)) - - def iter_nodes(self): - """ - Iterates the indexes (the unique identifiers) of the Expression nodes. - """ - return xrange(len(self.nodes)) - - def iter_edges(self, node): - """ - Iterates over the pairs: ``(relation, index)`` which are the neighbors - of ``node`` in the expression graph, where: - - ``node`` is the index of the node (the unique identifier). - - ``relation`` is the label of the edge between the nodes - - ``index`` is the index of the neighbor (the unique identifier). - """ - return iter(self.nodes[node]) - - def __add__(self, other): - """ - Merges ``self`` and ``other`` in a new Expression instance. - Ie, ``self`` and ``other`` are not modified. - """ - new = deepcopy(self) - new.merge(other) - return new - - def __iadd__(self, other): - """ - Merges ``self`` and ``other`` into ``self`` - ``other`` is not modified but the original data in ``self`` is lost. - """ - self.merge(other) - return self - - def __len__(self): - """ - Amount of nodes in the graph. - """ - return len(self.nodes) diff --git a/quepy/generation.py.bak b/quepy/generation.py.bak deleted file mode 100644 index 87b5d9f..0000000 --- a/quepy/generation.py.bak +++ /dev/null @@ -1,38 +0,0 @@ -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. 
-# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -Code generation from an expression to a database language. - -The currently supported languages are: - * MQL - * Sparql - * Dot: generation of graph images mainly for debugging. -""" - -from quepy.mql_generation import generate_mql -from quepy.dot_generation import expression_to_dot -from quepy.sparql_generation import expression_to_sparql - - -def get_code(expression, language): - """ - Given an expression and a supported language, it - returns the query for that expression on that language. - """ - - if language == "sparql": - return expression_to_sparql(expression) - elif language == "dot": - return expression_to_dot(expression) - elif language == "mql": - return generate_mql(expression) - else: - message = u"Language '{}' is not supported" - raise ValueError(message.format(language)) diff --git a/quepy/jiebatagger.py.bak b/quepy/jiebatagger.py.bak deleted file mode 100644 index a9a238d..0000000 --- a/quepy/jiebatagger.py.bak +++ /dev/null @@ -1,68 +0,0 @@ -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -Tagging using Jieba. -""" - -# Requiered data files are: -# - "averaged_perceptron_tagger" in Models -# - "wordnet" in Corpora - -import jieba -from quepy.tagger import Word -from quepy.encodingpolicy import assert_valid_encoding - -_penn_to_morphy_tag = {} - - -def penn_to_morphy_tag(tag): - assert_valid_encoding(tag) - - for penn, morphy in _penn_to_morphy_tag.iteritems(): - if tag.startswith(penn): - return morphy - return None - - -def run_jiebatagger(string): - """ - Runs jieba tagger on `string` and returns a list of - :class:`quepy.tagger.Word` objects. - """ - assert_valid_encoding(string) - global _penn_to_morphy_tag - - from nltk.corpus import wordnet - - if not _penn_to_morphy_tag: - _penn_to_morphy_tag = { - u'NN': wordnet.NOUN, - u'JJ': wordnet.ADJ, - u'VB': wordnet.VERB, - u'RB': wordnet.ADV, - } - - # Recommended tokenizer doesn't handle non-ascii characters very well - #tokens = jieba.word_tokenize(string) - token_tags = jieba.posseg.cut(string) - - words = [] - for token, pos in token_tags: - word = Word(token) - # Eliminates stuff like JJ|CC - # decode ascii because they are the penn-like POS tags (are ascii). - word.pos = pos.split("|")[0].decode("ascii") - - mtag = penn_to_morphy_tag(word.pos) - word.lemma = None - - words.append(word) - - return words diff --git a/quepy/mql_generation.py.bak b/quepy/mql_generation.py.bak deleted file mode 100644 index 97b3bd7..0000000 --- a/quepy/mql_generation.py.bak +++ /dev/null @@ -1,141 +0,0 @@ -# -*- coding: utf-8 -*- - -import re -import json -from quepy.dsl import IsRelatedTo -from quepy.expression import isnode -from quepy.encodingpolicy import encoding_flexible_conversion - - -def choose_start_node(e): - """ - Choose a node of the `Expression` such that no property leading to a data - has to be reversed (with !). - """ - # Since data "nodes" have no outgoing edges it sufices to find any node - # with an outgoing edge. - for node in e.iter_nodes(): - if list(e.iter_edges(node)): - return node - return node - - -def safely_to_unicode(x): - """ - Given an "edge" (a relation) or "a data" from an `Expression` graph - transform it into a unicode string fitted for insertion into a MQL query. 
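A sketch of the entry point of the module being deleted here; the converted quepy/mql_generation.py keeps the same interface. The relation names are made-up Freebase-style paths, purely for illustration:

    from quepy.expression import Expression
    from quepy.mql_generation import generate_mql

    e = Expression()
    e.add_data("/type/object/name", "Neumann")
    e.decapitate("/people/person/profession")
    target, mql = generate_mql(e)
    print(target)   # edges leading from the chosen start node to the head
    print(mql)      # tidied JSON: one nested dict per node of the graph
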
- """ - if isinstance(x, unicode): - return x - if isinstance(x, str): - return encoding_flexible_conversion(x) - if isinstance(x, IsRelatedTo): - return u"/type/reflect/any_master" - return unicode(x) # FIXME: Any object is unicode-able, this is error prone - - -def to_bidirected_graph(e): - """ - Rewrite the graph such that there are reversed edges for every forward - edge. - If an edge goes into a data, it should not be reversed. - """ - graph = {node: [] for node in e.iter_nodes()} - for node in e.iter_nodes(): - for relation, other in e.iter_edges(node): - relation = safely_to_unicode(relation) - if isnode(other): - graph[other].append((u"!" + relation, node)) - else: - other = safely_to_unicode(other) - graph[node].append((relation, other)) - assert all(isnode(x) for x in graph) and len(e) == len(graph) - return graph - - -def post_order_depth_first(graph, start): - """ - Iterate over the nodes of the graph (is a tree) in a way such that every - node is preceded by it's childs. - `graph` is a dict that represents the `Expression` graph. It's a tree too - beacuse Expressions are trees. - `start` is the node to use as the root of the tree. - """ - q = [start] - seen = set() - i = 0 - while i != len(graph): - node = q[i] - seen.add(node) - i += 1 - for _, other in graph[node]: - if isnode(other) and other not in seen: - q.append(other) - assert len(q) == len(graph) - q.reverse() - return q - - -def paths_from_root(graph, start): - """ - Generates paths from `start` to every other node in `graph` and puts it in - the returned dictionary `paths`. - ie.: `paths_from_node(graph, start)[node]` is a list of the edge names used - to get to `node` form `start`. - """ - paths = {start: []} - q = [start] - seen = set() - while q: - node = q.pop() - seen.add(node) - for relation, child in graph[node]: - if isnode(child) and child not in seen: - q.append(child) - paths[child] = paths[node] + [relation] - return paths - - -def generate_mql(e): - """ - Generates a MQL query for the `Expression` `e`. - """ - start = choose_start_node(e) - graph = to_bidirected_graph(e) - generated = {} - for node in post_order_depth_first(graph, start): - d = {} - for relation, other in graph[node]: - if isnode(other): - try: - other = generated[other] - except KeyError: - continue # other is not in post_order_depth_first order - d[relation] = other - generated[node] = [d] - - mql_query = json.dumps(generated[start], sort_keys=True, - indent=2, separators=(',', ': ')) - mql_query = _tidy(mql_query) - target = paths_from_root(graph, start)[e.get_head()] - return target, mql_query - - -def _tidy(mql): - """ - Given a json representing a MQL query it collapses spaces between - braces and curly braces to make it look tidy. - """ - def replacement_function(match): - text = match.group(0) - if text.startswith("[") and text.endswith("]"): - return "[{}]" - elif text.startswith("["): - return "[{" - indent = 0 - match = re.search("}[ \t]*\n(\s*?)\]", text) - if match: - indent = len(match.group(1)) - return " " * indent + "}]" - return re.sub("\[\s*{\s*}\s*\]|\[\s+{|[ \t]*}\s+\]", - replacement_function, mql) diff --git a/quepy/nltktagger.py.bak b/quepy/nltktagger.py.bak deleted file mode 100644 index bbd4455..0000000 --- a/quepy/nltktagger.py.bak +++ /dev/null @@ -1,81 +0,0 @@ -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. 
-# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -Tagging using NLTK. -""" - -# Requiered data files are: -# - "averaged_perceptron_tagger" in Models -# - "wordnet" in Corpora - -import nltk -from quepy.tagger import Word -from quepy.encodingpolicy import assert_valid_encoding - -_penn_to_morphy_tag = {} - - -def penn_to_morphy_tag(tag): - assert_valid_encoding(tag) - - for penn, morphy in _penn_to_morphy_tag.iteritems(): - if tag.startswith(penn): - return morphy - return None - - -def run_nltktagger(string, nltk_data_path=None): - """ - Runs nltk tagger on `string` and returns a list of - :class:`quepy.tagger.Word` objects. - """ - assert_valid_encoding(string) - global _penn_to_morphy_tag - - if nltk_data_path: - nltk.data.path = nltk_data_path - - from nltk.corpus import wordnet - - if not _penn_to_morphy_tag: - _penn_to_morphy_tag = { - u'NN': wordnet.NOUN, - u'JJ': wordnet.ADJ, - u'VB': wordnet.VERB, - u'RB': wordnet.ADV, - } - - # Recommended tokenizer doesn't handle non-ascii characters very well - #tokens = nltk.word_tokenize(string) - tokens = nltk.wordpunct_tokenize(string) - tags = nltk.pos_tag(tokens) - - words = [] - for token, pos in tags: - word = Word(token) - # Eliminates stuff like JJ|CC - # decode ascii because they are the penn-like POS tags (are ascii). - word.pos = pos.split("|")[0].decode("ascii") - - mtag = penn_to_morphy_tag(word.pos) - # Nice shooting, son. What's your name? - lemma = wordnet.morphy(word.token, pos=mtag) - if isinstance(lemma, str): - # In this case lemma is example-based, because if it's rule based - # the result should be unicode (input was unicode). - # Since english is ascii the decoding is ok. - lemma = lemma.decode("ascii") - word.lemma = lemma - if word.lemma is None: - word.lemma = word.token.lower() - - words.append(word) - - return words diff --git a/quepy/quepyapp.py.bak b/quepy/quepyapp.py.bak deleted file mode 100644 index e3187d0..0000000 --- a/quepy/quepyapp.py.bak +++ /dev/null @@ -1,162 +0,0 @@ -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -Implements the Quepy Application API -""" - -import logging -from importlib import import_module -from types import ModuleType - -from quepy import settings -from quepy import generation -from quepy.parsing import QuestionTemplate -from quepy.tagger import get_tagger, TaggingError -from quepy.encodingpolicy import encoding_flexible_conversion - -logger = logging.getLogger("quepy.quepyapp") - - -def install(app_name): - """ - Installs the application and gives an QuepyApp object - """ - - module_paths = { - u"settings": u"{0}.settings", - u"parsing": u"{0}", - } - modules = {} - - for module_name, module_path in module_paths.iteritems(): - try: - modules[module_name] = import_module(module_path.format(app_name)) - except ImportError, error: - message = u"Error importing {0!r}: {1}" - raise ImportError(message.format(module_name, error)) - - return QuepyApp(**modules) - - -def question_sanitize(question): - question = question.replace("'", "\'") - question = question.replace("\"", "\\\"") - return question - - -class QuepyApp(object): - """ - Provides the quepy application API. - """ - - def __init__(self, parsing, settings): - """ - Creates the application based on `parsing`, `settings` modules. 
- """ - - assert isinstance(parsing, ModuleType) - assert isinstance(settings, ModuleType) - - self._parsing_module = parsing - self._settings_module = settings - - # Save the settings right after loading settings module - self._save_settings_values() - - self.tagger = get_tagger() - self.language = getattr(self._settings_module, "LANGUAGE", None) - if not self.language: - raise ValueError("Missing configuration for language") - - self.rules = [] - for element in dir(self._parsing_module): - element = getattr(self._parsing_module, element) - - try: - if issubclass(element, QuestionTemplate) and \ - element is not QuestionTemplate: - - self.rules.append(element()) - except TypeError: - continue - - self.rules.sort(key=lambda x: x.weight, reverse=True) - - def get_query(self, question): - """ - Given `question` in natural language, it returns - three things: - - - the target of the query in string format - - the query - - metadata given by the regex programmer (defaults to None) - - The query returned corresponds to the first regex that matches in - weight order. - """ - - question = question_sanitize(question) - for target, query, userdata in self.get_queries(question): - return target, query, userdata - return None, None, None - - def get_queries(self, question): - """ - Given `question` in natural language, it returns - three things: - - - the target of the query in string format - - the query - - metadata given by the regex programmer (defaults to None) - - The queries returned corresponds to the regexes that match in - weight order. - """ - question = encoding_flexible_conversion(question) - for expression, userdata in self._iter_compiled_forms(question): - target, query = generation.get_code(expression, self.language) - message = u"Interpretation {1}: {0}" - logger.debug(message.format(str(expression), - expression.rule_used)) - logger.debug(u"Query generated: {0}".format(query)) - yield target, query, userdata - - def _iter_compiled_forms(self, question): - """ - Returns all the compiled form of the question. - """ - - try: - words = list(self.tagger(question)) - except TaggingError: - logger.warning(u"Can't parse tagger's output for: '%s'", - question) - return - - logger.debug(u"Tagged question:\n" + - u"\n".join(u"\t{}".format(w for w in words))) - - for rule in self.rules: - expression, userdata = rule.get_interpretation(words) - if expression: - yield expression, userdata - - def _save_settings_values(self): - """ - Persists the settings values of the app to the settings module - so it can be accesible from another part of the software. - """ - - for key in dir(self._settings_module): - if key.upper() == key: - value = getattr(self._settings_module, key) - if isinstance(value, str): - value = encoding_flexible_conversion(value) - setattr(settings, key, value) diff --git a/quepy/settings.py.bak b/quepy/settings.py.bak deleted file mode 100644 index e69c1ab..0000000 --- a/quepy/settings.py.bak +++ /dev/null @@ -1,31 +0,0 @@ -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -Settings. 
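For reviewers who have not used quepy: the `get_query` method shown in the deleted copy above is the library's main entry point, and the converted quepy/quepyapp.py keeps it unchanged. A sketch, assuming the examples/dbpedia app is on the path:

    import quepy

    app = quepy.install("dbpedia")
    target, query, userdata = app.get_query("Who is Tom Cruise?")
    print(query)    # query produced by the highest-weight matching rule
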
-""" - -# Generated query language -LANGUAGE = "sparql" - -# NLTK config -NLTK_DATA_PATH = [] # List of paths with NLTK data - -# Encoding config -DEFAULT_ENCODING = "utf-8" - -# Sparql config -SPARQL_PREAMBLE = u""" -PREFIX owl: -PREFIX rdfs: -PREFIX rdf: -PREFIX foaf: -PREFIX skos: -PREFIX quepy: -""" diff --git a/quepy/sparql_generation.py.bak b/quepy/sparql_generation.py.bak deleted file mode 100644 index 3b1a218..0000000 --- a/quepy/sparql_generation.py.bak +++ /dev/null @@ -1,70 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Sparql generation code. -""" - -from quepy import settings -from quepy.dsl import IsRelatedTo -from quepy.expression import isnode -from quepy.encodingpolicy import assert_valid_encoding - -_indent = u" " - - -def escape(string): - string = unicode(string) - string = string.replace("\n", "") - string = string.replace("\r", "") - string = string.replace("\t", "") - string = string.replace("\x0b", "") - if not string or any([x for x in string if 0 < ord(x) < 31]) or \ - string.startswith(":") or string.endswith(":"): - message = "Unable to generate sparql: invalid nodes or relation" - raise ValueError(message) - return string - - -def adapt(x): - if isnode(x): - x = u"?x{}".format(x) - return x - if isinstance(x, basestring): - assert_valid_encoding(x) - if x.startswith(u"\"") or ":" in x: - return x - return u'"{}"'.format(x) - return unicode(x) - - -def expression_to_sparql(e, full=False): - template = u"{preamble}\n" +\ - u"SELECT DISTINCT {select} WHERE {{\n" +\ - u"{expression}\n" +\ - u"}}\n" - head = adapt(e.get_head()) - if full: - select = u"*" - else: - select = head - y = 0 - xs = [] - for node in e.iter_nodes(): - for relation, dest in e.iter_edges(node): - if relation is IsRelatedTo: - relation = u"?y{}".format(y) - y += 1 - xs.append(triple(adapt(node), relation, adapt(dest), - indentation=1)) - sparql = template.format(preamble=settings.SPARQL_PREAMBLE, - select=select, - expression=u"\n".join(xs)) - return select, sparql - - -def triple(a, p, b, indentation=0): - a = escape(a) - b = escape(b) - p = escape(p) - s = _indent * indentation + u"{0} {1} {2}." - return s.format(a, p, b) diff --git a/quepy/tagger.py.bak b/quepy/tagger.py.bak deleted file mode 100644 index 557e093..0000000 --- a/quepy/tagger.py.bak +++ /dev/null @@ -1,74 +0,0 @@ -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -import logging - -from quepy import settings -from quepy.encodingpolicy import assert_valid_encoding - -logger = logging.getLogger("quepy.tagger") -PENN_TAGSET = set(u"$ `` '' ( ) , -- . : CC CD DT EX FW IN JJ JJR JJS LS MD " - "NN NNP NNPS NNS PDT POS PRP PRP$ RB RBR RBS RP SYM TO UH " - "VB VBD VBG VBN VBP VBZ WDT WP WP$ WRB".split()) - - -class TaggingError(Exception): - """ - Error parsing tagger's output. - """ - pass - - -class Word(object): - """ - Representation of a tagged word. - Contains *token*, *lemma*, *pos tag* and optionally a *probability* of - that tag. 
- """ - _encoding_attrs = u"token lemma pos".split() - _attrs = _encoding_attrs + [u"prob"] - - def __init__(self, token, lemma=None, pos=None, prob=None): - self.pos = pos - self.prob = prob - self.lemma = lemma - self.token = token - - def __setattr__(self, name, value): - if name in self._encoding_attrs and value is not None: - assert_valid_encoding(value) - object.__setattr__(self, name, value) - - def __unicode__(self): - attrs = (getattr(self, name, u"-") for name in self._attrs) - return u"|".join(str(x) for x in attrs) - - def __repr__(self): - return unicode(self) - - -def get_tagger(): - """ - Return a tagging function given some app settings. - `Settings` is the settings module of an app. - The returned value is a function that receives a unicode string and returns - a list of `Word` instances. - """ - from quepy.nltktagger import run_nltktagger - tagger_function = lambda x: run_nltktagger(x, settings.NLTK_DATA_PATH) - - def wrapper(string): - assert_valid_encoding(string) - words = tagger_function(string) - for word in words: - if word.pos not in PENN_TAGSET: - logger.warning("Tagger emmited a non-penn " - "POS tag {!r}".format(word.pos)) - return words - return wrapper diff --git a/tests/random_expression.py.bak b/tests/random_expression.py.bak deleted file mode 100644 index d223a07..0000000 --- a/tests/random_expression.py.bak +++ /dev/null @@ -1,74 +0,0 @@ -# -*- coding: utf-8 -*- -import random -from quepy.expression import Expression - - -def random_data(only_ascii=False): - data = [] - first = True - while first or 1 / 20.0 < random.random(): - first = False - if only_ascii: - c = unichr(random.randint(33, 126)) - data.append(c) - continue - x = random.random() - if 0.1 > x: - c = random.choice(u" ./\n") - elif 0.50 > x: - c = unichr(random.randint(65, 122)) - elif 0.85 > x: - c = unichr(random.randint(0, 127)) - else: - c = unichr(random.randint(0, 65535)) - data.append(c) - return u"".join(data) - - -def random_relation(only_ascii=False): - data = random_data(only_ascii) - data = data.replace(" ", "") - if random.random() > 0.05: - return data - - class UnicodeableDummy(object): - def __unicode__(self): - return data - return UnicodeableDummy() - - -def random_expression(only_ascii=False): - """ - operations: new node, add data, decapitate, merge - """ - mean_size = 20 - xs = [40.0, 30.0, 50.0, 20.0] - xs = [x * (1.0 - random.random()) for x in xs] - assert all(x != 0 for x in xs) - new_node, add_data, decapitate, _ = [x / sum(xs) for x in xs] - expressions = [Expression(), Expression(), Expression(), Expression()] - while len(expressions) != 1: - if (1.0 / mean_size) < random.random(): - # Will start to merge more and will not create new nodes - new_node = 0.0 - # Choose action - r = random.random() - if r < new_node: - # New expression - expressions.append(Expression()) - elif r < add_data + new_node: - # Add data - e = random.choice(expressions) - e.add_data(random_relation(only_ascii), random_data(only_ascii)) - elif r < decapitate + add_data + new_node: - # Decapitate - e = random.choice(expressions) - e.decapitate(random_relation(only_ascii), - reverse=(0.25 < random.random())) - elif len(expressions) != 1: - # Merge - random.shuffle(expressions) - e2 = expressions.pop() - e1 = expressions[-1] - e1 += e2 - return expressions[0] diff --git a/tests/test_dot_generation.py.bak b/tests/test_dot_generation.py.bak deleted file mode 100644 index 2b8d68f..0000000 --- a/tests/test_dot_generation.py.bak +++ /dev/null @@ -1,87 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 
(c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -import unittest -import tempfile -import subprocess -from random_expression import random_expression -from random import seed -from quepy.dot_generation import expression_to_dot -from quepy.dsl import FixedRelation, FixedType, \ - FixedDataRelation - - -def gen_datarel(rel, data): - class X(FixedDataRelation): - relation = rel - return X(data) - - -def gen_fixedtype(type_): - class X(FixedType): - fixedtype = type_ - return X() - - -def gen_fixedrelation(rel, e): - class X(FixedRelation): - relation = rel - return X(e) - - -class TestDotGeneration(unittest.TestCase): - - def _standard_check(self, s, e): - self.assertIsInstance(s, unicode) - vs = [u"x{}".format(i) for i in xrange(len(e))] - for var in vs: - self.assertIn(var, s) - - def test_dot_takes_unicode(self): - e = gen_fixedtype(u"·̣─@łæßð~¶½") - e += gen_datarel(u"tµŧurułej€", u"←ðßðæßđæßæđßŋŋæ @~~·ŋŋ·¶·ŋ“¶¬@@") - _, s = expression_to_dot(e) - self._standard_check(s, e) - - def test_dot_takes_fails_ascii1(self): - e = gen_fixedtype("a") - e += gen_datarel("b", "c") - e = gen_fixedrelation("d", e) - self.assertRaises(ValueError, expression_to_dot, e) - - def test_dot_takes_fails_ascii2(self): - e = gen_fixedtype("·̣─@łæßð~¶½") - e += gen_datarel("tµŧurułej€", "←ðßðæßđæßæđßŋŋæ @~~·ŋŋ·¶·ŋ“¶¬@@") - self.assertRaises(ValueError, expression_to_dot, e) - - def test_dot_stress(self): - seed("I have come here to chew bubblegum and kick ass... " - "and I'm all out of bubblegum.") - dot_file = tempfile.NamedTemporaryFile() - cmdline = "dot %s" % dot_file.name - msg = "dot returned error code {}, check {} input file." - for _ in xrange(100): - expression = random_expression() - _, dot_string = expression_to_dot(expression) - with open(dot_file.name, "w") as filehandler: - filehandler.write(dot_string.encode("utf-8")) - - try: - retcode = subprocess.call(cmdline.split(), - stdout=tempfile.TemporaryFile()) - except OSError: - print "Warning: the program 'dot' was not found, tests skipped" - return - if retcode != 0: - dot_file.delete = False - self.assertEqual(retcode, 0, msg.format(retcode, dot_file.name)) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_dsl.py.bak b/tests/test_dsl.py.bak deleted file mode 100644 index 792fc91..0000000 --- a/tests/test_dsl.py.bak +++ /dev/null @@ -1,97 +0,0 @@ -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. 
-# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -import unittest -from quepy.expression import Expression -from quepy.dsl import HasKeyword, FixedRelation, FixedType, \ - FixedDataRelation - - -class TestDSL(unittest.TestCase): - def test_fixed_relation(self): - - class MyFixedRelation(FixedRelation): - relation = u"uranium:blowtorch" - - empty = Expression() - fixedinstance = MyFixedRelation(empty) - - head = fixedinstance.get_head() - relations = [x[0] for x in fixedinstance.iter_edges(head)] - - self.assertIn(u"uranium:blowtorch", relations) - - def test_fixed_type(self): - - class MyFixedType(FixedType): - fixedtype = u"uranium:blowtorch" - fixedtyperelation = u"rdf:type" - - fixedinstance = MyFixedType() - - head = fixedinstance.get_head() - edges = list(fixedinstance.iter_edges(head)) - - self.assertEqual(len(edges), 1) - self.assertIsInstance(edges[0][0], unicode) - self.assertEqual(edges[0][0], u"rdf:type") - self.assertIsInstance(edges[0][1], unicode) - self.assertEqual(edges[0][1], u"uranium:blowtorch") - - def test_fixed_data_relation(self): - - class MyFixedDataRelation(FixedDataRelation): - relation = u"uranium:blowtorch" - - fixedinstance = MyFixedDataRelation(u"soplete") - head = fixedinstance.get_head() - edges = list(fixedinstance.iter_edges(head)) - - self.assertEqual(len(edges), 1) - self.assertIsInstance(edges[0][0], unicode) - self.assertEqual(edges[0][0], u"uranium:blowtorch") - self.assertIsInstance(edges[0][1], unicode) - self.assertEqual(edges[0][1], u"soplete") - - def test_has_keyword(self): - - HasKeyword.relation = u"uranium:keyword" - keywordinstance = HasKeyword(u"soplete") - - head = keywordinstance.get_head() - edges = list(keywordinstance.iter_edges(head)) - self.assertEqual(len(edges), 1) - self.assertIsInstance(edges[0][0], unicode) - self.assertEqual(edges[0][0], u"uranium:keyword") - self.assertIsInstance(edges[0][1], unicode) - self.assertEqual(edges[0][1], u'soplete') - - # With language - HasKeyword.language = "en" - keywordinstance = HasKeyword("soplete") - - head = keywordinstance.get_head() - edges = list(keywordinstance.iter_edges(head)) - self.assertEqual(len(edges), 1) - self.assertIsInstance(edges[0][1], unicode) - self.assertEqual(edges[0][1], u'"soplete"@en') - - # With sanitize - HasKeyword.sanitize = staticmethod(lambda x: x.upper()) - keywordinstance = HasKeyword(u"soplete") - - head = keywordinstance.get_head() - edges = list(keywordinstance.iter_edges(head)) - self.assertEqual(len(edges), 1) - self.assertIsInstance(edges[0][1], unicode) - self.assertEqual(edges[0][1], u'"SOPLETE"@en') - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_expressions.py.bak b/tests/test_expressions.py.bak deleted file mode 100644 index 639d000..0000000 --- a/tests/test_expressions.py.bak +++ /dev/null @@ -1,279 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -Tests for expressions. -""" - -import unittest -from quepy.expression import Expression, isnode - - -def make_canonical_expression(e): - i = 0 - q = [e.get_head()] - seen = set() - while i != len(q): - node = q[i] - i += 1 - assert node not in seen, "Nouuu, expression is cyclic!" 
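        # (The scan below completes the queue walk started above: `q` grows as
        #  node children are appended, `i` sweeps it exactly once, and the
        #  assert is the cycle guard; a cyclic expression would otherwise grow
        #  `q` forever. Reversing `q` afterwards yields children before
        #  parents, so `canon` can be filled in bottom-up.)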
- for relation, child in e.iter_edges(node): - if isnode(child): - q.append(child) - q.reverse() - canon = {} - for node in q: - childs = [] - for label, child in e.iter_edges(node): - if isnode(child): - child = canon[child] - childs.append((label, child)) - childs.sort() - canon[node] = tuple(childs) - return canon[e.get_head()] - - -class ExpressionTests(object): - def test_acyclic(self): - head = self.e.get_head() - q = [head] - seen = set() - while q: - current = q.pop() - self.assertNotIn(current, seen) - seen.add(current) - for relation, child in self.e.iter_edges(current): - if isnode(child): - q.append(child) - - def test_non_empty(self): - self.assertNotEqual(len(self.e), 0) - - def test_add_data(self): - rel = u"|@·~½" - data = "somedata" - self.e.add_data(rel, data) - xs = list(self.e.iter_edges(self.e.get_head())) - self.assertIn((rel, data), xs) - - def test_decapitate(self): - oldhead = self.e.get_head() - self.e.decapitate("blabla") - self.assertNotEqual(oldhead, self.e.get_head()) - xs = list(self.e.iter_edges(self.e.get_head())) - self.assertEqual(xs, [("blabla", oldhead)]) - - def test_merges1(self): - oldlen = len(self.e) - oldhead = self.e.get_head() - other = Expression() - other.decapitate("blabla") - self.e.merge(other) - self.assertEqual(self.e.get_head(), oldhead) - self.assertEqual(len(self.e), oldlen + len(other) - 1) - - def test_merges2(self): - other = Expression() - other.decapitate("blabla") - oldlen = len(other) - oldhead = other.get_head() - other.merge(self.e) - self.assertEqual(other.get_head(), oldhead) - self.assertEqual(len(other), oldlen + len(self.e) - 1) - - def test_plus_makes_copy(self): - other = Expression() - other.decapitate("blabla") - a = self.e + other - self.assertFalse(a is other or self.e is other or a is self.e) - - def test_plus_is_conmutative(self): - other = Expression() - other.decapitate("blabla") - a = self.e + other - b = other + self.e - self.assertEqual(make_canonical_expression(a), - make_canonical_expression(b)) - - def test_plus_is_conmutative2(self): - other = Expression() - other.decapitate("blabla") - a = self.e + other + self.e - b = other + self.e + self.e - self.assertEqual(make_canonical_expression(a), - make_canonical_expression(b)) - - -class TestExpression1(unittest.TestCase, ExpressionTests): - def setUp(self): - self.e = Expression() - - -class TestExpression2(unittest.TestCase, ExpressionTests): - def setUp(self): - self.e = Expression() - self.e.add_data("key", "1") - self.e.add_data("key", "2") - self.e.add_data(u"~·~··@↓", None) - self.e.add_data(None, None) - - -class TestExpression3(unittest.TestCase, ExpressionTests): - def setUp(self): - self.e = Expression() - self.e.add_data("key", "1") - self.e.decapitate(u"µ") - self.e.add_data("a", "2") - self.e.add_data("a", "3") - self.e.add_data(None, None) - self.e.decapitate(None) - self.e.add_data(None, None) - - -class TestExpression4(unittest.TestCase, ExpressionTests): - def setUp(self): - self.e = Expression() - self.e.add_data(123, "456") - other = Expression() - other.add_data(0, "1") - other.add_data(2, "3") - other.decapitate("iuju") - for _ in xrange(5): - self.e.decapitate("nouu") - self.e += other - - -class CanonEqualTest(object): - def test_are_the_same(self): - a = make_canonical_expression(self.a) - b = make_canonical_expression(self.b) - self.assertEqual(a, b) - - -class CanonNotEqualTest(object): - def test_are_the_same(self): - a = make_canonical_expression(self.a) - b = make_canonical_expression(self.b) - self.assertNotEqual(a, b) - - 
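# A condensed sketch of the property these Canon* fixtures pin down (using
# only names from this deleted file plus quepy's public Expression API):
# adding the same edges in a different order must canonicalize identically.
from quepy.expression import Expression

first = Expression()
first.add_data("key", "1")
first.add_data("key", "2")

second = Expression()
second.add_data("key", "2")
second.add_data("key", "1")

assert make_canonical_expression(first) == make_canonical_expression(second)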
-class TestCanon1(unittest.TestCase, CanonEqualTest): - def setUp(self): - self.a = Expression() - self.b = Expression() - - -class TestCanon2(unittest.TestCase, CanonEqualTest): - def setUp(self): - self.a = Expression() - self.a.add_data(None, "1") - self.a.add_data(None, "2") - self.b = Expression() - self.b.add_data(None, "2") - self.b.add_data(None, "1") - - -class TestCanon3(unittest.TestCase, CanonEqualTest): - def setUp(self): - A = Expression() - A.add_data("bla", "somedata") - A.decapitate("hier") - B = Expression() - B.add_data("ble", "otherdata") - B.decapitate("hier") - self.a = A + B - self.b = B + A - - -class TestCanon4(unittest.TestCase, CanonEqualTest): - def setUp(self): - A = Expression() - A.add_data("bla", "somedata") - A.decapitate("hier") - B = Expression() - B.add_data("ble", "otherdata") - B.decapitate("hier") - C = A + B - C.decapitate("hier") - C += B - C.decapitate("hier") - self.a = C + A - D = B + A - D.decapitate("hier") - D += B - D.decapitate("hier") - self.b = D + A - - -class TestCanon95(unittest.TestCase, CanonNotEqualTest): - def setUp(self): - self.a = Expression() - self.a.decapitate("onelevel") - - self.b = Expression() - self.b.decapitate("onelevel", reverse=True) - - -class TestCanon96(unittest.TestCase, CanonNotEqualTest): - def setUp(self): - self.a = Expression() - self.a.add_data(0, "data") - self.a.decapitate("onelevel") - - self.b = Expression() - self.b.add_data(0, "data") - self.b.decapitate("onelevel", reverse=True) - - -class TestCanon97(unittest.TestCase, CanonNotEqualTest): - def setUp(self): - other = Expression() - other.decapitate("onelevel") - self.a = Expression() - for _ in xrange(5): - self.a.decapitate("step") - self.a += other - - other = Expression() - other.decapitate("onelevel", reverse=True) - self.b = Expression() - for _ in xrange(5): - self.b.decapitate("step") - self.b += other - - -class TestCanon98(unittest.TestCase, CanonNotEqualTest): - def setUp(self): - other = Expression() - other.add_data(0, "data") - other.decapitate("onelevel") - self.a = Expression() - for _ in xrange(5): - self.a.decapitate("step") - self.a += other - - other = Expression() - other.add_data(0, "data") - other.decapitate("onelevel", reverse=True) - self.b = Expression() - for _ in xrange(5): - self.b.decapitate("step") - self.b += other - - -class TestCanon99(unittest.TestCase, CanonNotEqualTest): - def setUp(self): - self.a = Expression() - self.b = Expression() - self.b.decapitate("relation") - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_mql_generation.py.bak b/tests/test_mql_generation.py.bak deleted file mode 100644 index d437ce1..0000000 --- a/tests/test_mql_generation.py.bak +++ /dev/null @@ -1,68 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -import json -from random import seed -import unittest -from random_expression import random_expression -from quepy.mql_generation import generate_mql - - -class TestMqlGeneration(unittest.TestCase): - def _get_json(self, query): - try: - return json.loads(query) - except ValueError as e: - if "Unpaired" in str(e) and "surrogate" in str(e): - # This is a known issue python's json. 
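                # (random_data() in random_expression.py.bak draws characters
                #  from the full 16-bit range, so generated queries can carry
                #  lone surrogates in U+D800..U+DFFF that json cannot parse
                #  back; returning None deliberately skips those cases.)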
- return None - - def _valid_mql_query(self, query): - x = self._get_json(query) - if x is None: - return - q = [x] - while q: - x = q.pop() - # Each entry is either a [{...}] or a unicode - if isinstance(x, list): - self.assertIsInstance(x[0], dict) - self.assertEqual(len(x), 1) - for key, value in x[0].iteritems(): - self.assertIsInstance(key, unicode) - q.append(value) - else: - self.assertIsInstance(x, unicode) - - def _valid_target_for_query(self, target, query): - self.assertIsInstance(target, list) - for entry in target: - self.assertIsInstance(entry, unicode) - x = self._get_json(query) - if x is None: - return - target = list(target) - while target: - entry = target.pop(0) - x = x[0][entry] - self.assertIsInstance(x, list) - self.assertEqual(len(x), 1) - self.assertIsInstance(x[0], dict) - #self.assertEqual(len(x[0]), 0) # Too strict? - - def test_mql_stress(self): - seed("playadito vs amanda... 3 focas") - for _ in xrange(100): - expression = random_expression() - target, mql = generate_mql(expression) - self._valid_mql_query(mql) - self._valid_target_for_query(target, mql) - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_nltktagger.py.bak b/tests/test_nltktagger.py.bak deleted file mode 100644 index e45e272..0000000 --- a/tests/test_nltktagger.py.bak +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -Tests for nltktagger. -""" - -import unittest -from quepy import nltktagger -from quepy.tagger import Word - - -class TestNLTKTagger(unittest.TestCase): - def test_word_output(self): - output = nltktagger.run_nltktagger(u"this is a test case «¢ðßæŋħħ") - - self.assertIsInstance(output, list) - for word in output: - self.assertIsInstance(word, Word) - - def tests_wrong_input(self): - self.assertRaises(ValueError, nltktagger.run_nltktagger, - "this is not unicode") diff --git a/tests/test_parsing.py.bak b/tests/test_parsing.py.bak deleted file mode 100644 index b58742d..0000000 --- a/tests/test_parsing.py.bak +++ /dev/null @@ -1,128 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -Tests for Regex module. 
-""" - -import unittest -from quepy.parsing import QuestionTemplate, Particle, Lemma -from quepy.tagger import Word - - -class Mockrule(object): - rulename = "Mock" - - -class TestQuestionTemplate(unittest.TestCase): - def setUp(self): - self.mockrule = Mockrule - - class SomeRegex(QuestionTemplate): - regex = Lemma(u"hello") - - def interpret(self, match): - return Mockrule - - class SomeRegexWithData(QuestionTemplate): - regex = Lemma(u"hello") - - def interpret(self, match): - return Mockrule, 42 - - self.regexinstance = SomeRegex() - self.regex_with_data = SomeRegexWithData() - - def test_match(self): - words = [Word(u"hi", u"hello")] - ir, userdata = self.regexinstance.get_interpretation(words) - self.assertTrue(ir is self.mockrule) - self.assertEqual(userdata, None) - - def test_no_match(self): - words = [Word(u"hi", u"hello"), Word(u"girl", u"girl")] - ir, userdata = self.regexinstance.get_interpretation(words) - self.assertEqual(ir, None) - self.assertEqual(userdata, None) - - def test_user_data(self): - words = [Word(u"hi", u"hello")] - _, userdata = self.regex_with_data.get_interpretation(words) - self.assertEqual(userdata, 42) - - def test_no_ir(self): - class SomeRegex(QuestionTemplate): - regex = Lemma(u"hello") - - regexinstance = SomeRegex() - words = [Word(u"hi", u"hello")] - self.assertRaises(NotImplementedError, - regexinstance.get_interpretation, words) - - def test_regex_empty(self): - class SomeRegex(QuestionTemplate): - def interpret(self, match): - return Mockrule, "YES!" - - regexinstance = SomeRegex() - words = [Word(u"hi", u"hello")] - ir, userdata = regexinstance.get_interpretation(words) - self.assertTrue(ir is Mockrule) - self.assertEqual(userdata, "YES!") - - def test_match_words(self): - class SomeRegex(QuestionTemplate): - def interpret(self, match): - return match - - words = [Word(u"|@€đ€łł@ð«|µnþ", u"hello"), Word(u"a", u"b", u"c")] - match, _ = SomeRegex().get_interpretation(words) - self.assertEqual(words, match.words) - - -class TestParticle(unittest.TestCase): - def setUp(self): - class Person(Particle): - regex = Lemma(u"Jim") | Lemma(u"Tonny") - - def interpret(self, match): - return match - - class PersonRegex(QuestionTemplate): - regex = Person() + Lemma(u"be") + Person(u"another") - - def interpret(self, match): - return match - - class PersonAsset(Person): - regex = Person() + Lemma(u"'s") + Lemma(u"car") - - class NestedParticleRegex(PersonRegex): - regex = PersonAsset() + Lemma(u"be") + Person(u"another") - - self.personregex = PersonRegex() - self.nestedregex = NestedParticleRegex() - - def test_attrs(self): - words = [Word(x, x) for x in u"Jim be Tonny".split()] - match, _ = self.personregex.get_interpretation(words) - self.assertEqual(match.another.words[0], words[-1]) - self.assertEqual(match.person.words[0], words[0]) - self.assertRaises(AttributeError, lambda: match.pirulo) - - def test_nested_particle(self): - words = [Word(x, x) for x in u"Jim 's car be Tonny".split()] - match, _ = self.nestedregex.get_interpretation(words) - self.assertEqual(match.personasset.words[0], words[0]) - self.assertRaises(AttributeError, lambda: match.personasset.another) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_quepyapp.py.bak b/tests/test_quepyapp.py.bak deleted file mode 100644 index 7beb106..0000000 --- a/tests/test_quepyapp.py.bak +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. 
-# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -Tests for QuepyApp. -""" - -import unittest - -import quepy - - -class TestQuepyApp(unittest.TestCase): - - def setUp(self): - self.app = quepy.install("testapp") - - def test_get_query_types(self): - question = "What is this?" - target, query, userdata = self.app.get_query(question) - - self.assertIsInstance(target, unicode) - self.assertIsInstance(query, unicode) - - def test_get_user_data(self): - question = "user data" - target, query, userdata = self.app.get_query(question) - self.assertEqual(userdata, "") - - def test_priority(self): - question = "something something" - target, query, userdata = self.app.get_query(question) - self.assertEqual(userdata, 42) - - def test_config_is_saved(self): - from quepy import settings - self.assertIn("testapp", settings.SPARQL_PREAMBLE) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_sparql_generation.py.bak b/tests/test_sparql_generation.py.bak deleted file mode 100644 index 836f7f2..0000000 --- a/tests/test_sparql_generation.py.bak +++ /dev/null @@ -1,102 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -import re -import unittest -from random_expression import random_expression -from random import seed -from quepy.sparql_generation import expression_to_sparql -from quepy.dsl import FixedRelation, FixedType, \ - FixedDataRelation - - -def gen_datarel(rel, data): - class X(FixedDataRelation): - relation = rel - return X(data) - - -def gen_fixedtype(type_): - class X(FixedType): - fixedtype = type_ - return X() - - -def gen_fixedrelation(rel, e): - class X(FixedRelation): - relation = rel - return X(e) - - -class TestSparqlGeneration(unittest.TestCase): - - _sparql_line = re.compile("\?x\d+ \S+ (?:\?x\d+|\".*\"|\S+?:\S+?)" - "(?:@\w+)?.", re.DOTALL) - _sparql_query_start = re.compile("SELECT DISTINCT .+ WHERE {(.+)}", - re.DOTALL) - - def _standard_check(self, s, e): - self.assertIsInstance(s, unicode) - vs = [u"x{}".format(i) for i in xrange(len(e))] - for var in vs: - self.assertIn(var, s) - - def _sparql_check(self, s): - m = self._sparql_query_start.search(s) - self.assertNotEqual(m, None, "Could not find query start ") - lines = m.group(1).split("\n") - for line in lines: - line = line.strip() - if line: - s = "Line out of format: {!r}\n".format(line) - self.assertNotEqual(self._sparql_line.match(line), None, s) - - def test_sparql_takes_unicode(self): - e = gen_fixedtype(u"·̣─@łæßð~¶½") - e += gen_datarel(u"tµŧurułej€", u"←ðßðæßđæßæđßŋŋæ @~~·ŋŋ·¶·ŋ“¶¬@@") - _, s = expression_to_sparql(e) - self._standard_check(s, e) - self._sparql_check(s) - - @unittest.skip("should be fixed") - def test_sparql_ascii_stress(self): - seed("sacala dunga dunga dunga") - for _ in xrange(100): - expression = random_expression(only_ascii=True) - _, s = expression_to_sparql(expression) - self._standard_check(s, expression) - self._sparql_check(s) - - def test_sparql_stress(self): - seed("sacala dunga dunga dunga") - for _ in xrange(100): - expression = random_expression() - try: - _, s = expression_to_sparql(expression) - except ValueError as error: - if "Unable to generate sparql" in 
str(error): - continue - - self._standard_check(s, expression) - self._sparql_check(s) - - def test_sparql_takes_fails_ascii1(self): - e = gen_fixedtype("a") - e += gen_datarel("b", "c") - e = gen_fixedrelation("d", e) - self.assertRaises(ValueError, expression_to_sparql, e) - - def test_sparql_takes_fails_ascii2(self): - e = gen_fixedtype("·̣─@łæßð~¶½") - e += gen_datarel("tµŧurułej€", "←ðßðæßđæßæđßŋŋæ @~~·ŋŋ·¶·ŋ“¶¬@@") - self.assertRaises(ValueError, expression_to_sparql, e) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_tagger.py.bak b/tests/test_tagger.py.bak deleted file mode 100644 index 39be54a..0000000 --- a/tests/test_tagger.py.bak +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -Tests for tagger. -""" - -import unittest -from quepy import tagger - - -class TestTagger(unittest.TestCase): - def test_tagset_unicode(self): - for tag in tagger.PENN_TAGSET: - self.assertIsInstance(tag, unicode) - - def test_word_encoding(self): - word = tagger.Word(token=u"æßđħłłþłłł@æµß", - lemma=u"ŧłþłßæ#¶ŋħ~#~@", - pos=u"øĸŋøħþ€ĸłþ€øæ«»¢") - - self.assertIsInstance(word.token, unicode) - self.assertEqual(word.token, u"æßđħłłþłłł@æµß") - self.assertIsInstance(word.lemma, unicode) - self.assertEqual(word.lemma, u"ŧłþłßæ#¶ŋħ~#~@") - self.assertIsInstance(word.pos, unicode) - self.assertEqual(word.pos, u"øĸŋøħþ€ĸłþ€øæ«»¢") - - def test_word_wrong_encoding(self): - # Token not unicode - self.assertRaises(ValueError, tagger.Word, "æßđħłłþłłł@æµß", - u"ŧłþłßæ#¶ŋħ~#~@", u"øĸŋøħþ€ĸłþ€øæ«»¢") - # Lemma not unicode - self.assertRaises(ValueError, tagger.Word, u"æßđħłłþłłł@æµß", - "ŧłþłßæ#¶ŋħ~#~@", u"øĸŋøħþ€ĸłþ€øæ«»¢") - # Pos not unicode - self.assertRaises(ValueError, tagger.Word, u"æßđħłłþłłł@æµß", - u"ŧłþłßæ#¶ŋħ~#~@", "øĸŋøħþ€ĸłþ€øæ«»¢") - - def test_word_attrib_set(self): - word = tagger.Word(u"æßđħłłþłłł@æµß") - word.lemma = u"ŧłþłßæ#¶ŋħ~#~@" - word.pos = u"øĸŋøħþ€ĸłþ€øæ«»¢" - - self.assertIsInstance(word.token, unicode) - self.assertEqual(word.token, u"æßđħłłþłłł@æµß") - self.assertIsInstance(word.lemma, unicode) - self.assertEqual(word.lemma, u"ŧłþłßæ#¶ŋħ~#~@") - self.assertIsInstance(word.pos, unicode) - self.assertEqual(word.pos, u"øĸŋøħþ€ĸłþ€øæ«»¢") - - def test_word_wrong_attrib_set(self): - word = tagger.Word(u"æßđħłłþłłł@æµß") - - # Token not unicode - self.assertRaises(ValueError, setattr, word, "token", "æßđħłłþłłł@æµß") - # Lemma not unicode - self.assertRaises(ValueError, setattr, word, "lemma", "ŧłþłßæ#¶ŋħ~#~@") - # Pos not unicode - self.assertRaises(ValueError, setattr, word, "pos", "øĸŋøħþ€ĸłþ€øæ«»¢") From 344a666480adec1b5e05f609772f5110410d3b88 Mon Sep 17 00:00:00 2001 From: Bowen Fu Date: Tue, 22 Aug 2017 14:14:56 +0800 Subject: [PATCH 05/10] remove cntagger and jiebatagger --- quepy/cntagger.py | 74 -------------------------------------------- quepy/jiebatagger.py | 67 --------------------------------------- 2 files changed, 141 deletions(-) delete mode 100644 quepy/cntagger.py delete mode 100644 quepy/jiebatagger.py diff --git a/quepy/cntagger.py b/quepy/cntagger.py deleted file mode 100644 index 9f50366..0000000 --- a/quepy/cntagger.py +++ /dev/null @@ -1,74 +0,0 @@ -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. 
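# test_tagger.py.bak above pins the Word contract that the module removed
# below duplicated: token, lemma and pos must be text, enforced on every
# assignment through assert_valid_encoding. A minimal sketch (post-2to3,
# where "text" means str):
from quepy.tagger import Word

word = Word("dogs", lemma="dog", pos="NNS")  # text attributes: accepted
try:
    word.pos = b"NNS"                        # bytes: rejected
except ValueError:
    pass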
-# This file is part of quepy and is distributed under the Modified BSD License.
-# You should have received a copy of license in the LICENSE file.
-#
-# Authors: Rafael Carrascosa
-#          Gonzalo Garcia Berrotaran
-
-import logging
-
-from quepy import settings
-from quepy.encodingpolicy import assert_valid_encoding
-
-logger = logging.getLogger("quepy.tagger")
-PENN_TAGSET = set("$ `` '' ( ) , -- . : CC CD DT EX FW IN JJ JJR JJS LS MD "
-                  "NN NNP NNPS NNS PDT POS PRP PRP$ RB RBR RBS RP SYM TO UH "
-                  "VB VBD VBG VBN VBP VBZ WDT WP WP$ WRB".split())
-
-
-class TaggingError(Exception):
-    """
-    Error parsing tagger's output.
-    """
-    pass
-
-
-class Word(object):
-    """
-    Representation of a tagged word.
-    Contains *token*, *lemma*, *pos tag* and optionally a *probability* of
-    that tag.
-    """
-    _encoding_attrs = "token lemma pos".split()
-    _attrs = _encoding_attrs + ["prob"]
-
-    def __init__(self, token, lemma=None, pos=None, prob=None):
-        self.pos = pos
-        self.prob = prob
-        self.lemma = lemma
-        self.token = token
-
-    def __setattr__(self, name, value):
-        if name in self._encoding_attrs and value is not None:
-            assert_valid_encoding(value)
-        object.__setattr__(self, name, value)
-
-    def __unicode__(self):
-        attrs = (getattr(self, name, "-") for name in self._attrs)
-        return "|".join(str(x) for x in attrs)
-
-    def __repr__(self):
-        return str(self)
-
-
-def get_cntagger():
-    """
-    Return a tagging function given some app settings.
-    `Settings` is the settings module of an app.
-    The returned value is a function that receives a unicode string and returns
-    a list of `Word` instances.
-    """
-    from quepy.jiebatagger import run_jiebatagger
-    tagger_function = lambda x: run_jiebatagger(x)
-
-    def wrapper(string):
-        assert_valid_encoding(string)
-        words = tagger_function(string)
-        for word in words:
-            if word.pos not in PENN_TAGSET:
-                logger.warning("Tagger emitted a non-penn "
-                               "POS tag {!r}".format(word.pos))
-        return words
-    return wrapper
diff --git a/quepy/jiebatagger.py b/quepy/jiebatagger.py
deleted file mode 100644
index 34ab95a..0000000
--- a/quepy/jiebatagger.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# coding: utf-8
-
-# Copyright (c) 2012, Machinalis S.R.L.
-# This file is part of quepy and is distributed under the Modified BSD License.
-# You should have received a copy of license in the LICENSE file.
-#
-# Authors: Rafael Carrascosa
-#          Gonzalo Garcia Berrotaran
-
-"""
-Tagging using Jieba.
-"""
-
-# Required data files are:
-#   - "averaged_perceptron_tagger" in Models
-#   - "wordnet" in Corpora
-
-import jieba
-from quepy.tagger import Word
-from quepy.encodingpolicy import assert_valid_encoding
-
-_penn_to_morphy_tag = {}
-
-
-def penn_to_morphy_tag(tag):
-    assert_valid_encoding(tag)
-
-    for penn, morphy in _penn_to_morphy_tag.items():
-        if tag.startswith(penn):
-            return morphy
-    return None
-
-
-def run_jiebatagger(string):
-    """
-    Runs jieba tagger on `string` and returns a list of
-    :class:`quepy.tagger.Word` objects.
- """ - assert_valid_encoding(string) - global _penn_to_morphy_tag - - from nltk.corpus import wordnet - - if not _penn_to_morphy_tag: - _penn_to_morphy_tag = { - 'NN': wordnet.NOUN, - 'JJ': wordnet.ADJ, - 'VB': wordnet.VERB, - 'RB': wordnet.ADV, - } - - # Recommended tokenizer doesn't handle non-ascii characters very well - #tokens = jieba.word_tokenize(string) - token_tags = jieba.posseg.cut(string) - - words = [] - for token, pos in token_tags: - word = Word(token) - # Eliminates stuff like JJ|CC - word.pos = pos.split("|")[0] - - mtag = penn_to_morphy_tag(word.pos) - word.lemma = None - - words.append(word) - - return words From 66ae62f8f7c8bda70b33e49b23b2aca724c3789f Mon Sep 17 00:00:00 2001 From: Bowen Fu Date: Tue, 22 Aug 2017 14:16:15 +0800 Subject: [PATCH 06/10] remove bak --- examples/dbpedia/main.py.bak | 212 ---------------------------------- examples/freebase/main.py.bak | 67 ----------- tests/testapp/__init__.py.bak | 15 --- 3 files changed, 294 deletions(-) delete mode 100644 examples/dbpedia/main.py.bak delete mode 100644 examples/freebase/main.py.bak delete mode 100644 tests/testapp/__init__.py.bak diff --git a/examples/dbpedia/main.py.bak b/examples/dbpedia/main.py.bak deleted file mode 100644 index 39eee69..0000000 --- a/examples/dbpedia/main.py.bak +++ /dev/null @@ -1,212 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -Main script for DBpedia quepy. -""" - -import sys -import time -import random -import datetime - -import quepy -from SPARQLWrapper import SPARQLWrapper, JSON - -sparql = SPARQLWrapper("http://dbpedia.org/sparql") -dbpedia = quepy.install("dbpedia") - -# quepy.set_loglevel("DEBUG") - - -def print_define(results, target, metadata=None): - for result in results["results"]["bindings"]: - if result[target]["xml:lang"] == "en": - print result[target]["value"] - print - - -def print_enum(results, target, metadata=None): - used_labels = [] - - for result in results["results"]["bindings"]: - if result[target]["type"] == u"literal": - if result[target]["xml:lang"] == "en": - label = result[target]["value"] - if label not in used_labels: - used_labels.append(label) - print label - - -def print_literal(results, target, metadata=None): - for result in results["results"]["bindings"]: - literal = result[target]["value"] - if metadata: - print metadata.format(literal) - else: - print literal - - -def print_time(results, target, metadata=None): - gmt = time.mktime(time.gmtime()) - gmt = datetime.datetime.fromtimestamp(gmt) - - for result in results["results"]["bindings"]: - offset = result[target]["value"].replace(u"−", u"-") - - if ("to" in offset) or ("and" in offset): - if "to" in offset: - connector = "and" - from_offset, to_offset = offset.split("to") - else: - connector = "or" - from_offset, to_offset = offset.split("and") - - from_offset, to_offset = int(from_offset), int(to_offset) - - if from_offset > to_offset: - from_offset, to_offset = to_offset, from_offset - - from_delta = datetime.timedelta(hours=from_offset) - to_delta = datetime.timedelta(hours=to_offset) - - from_time = gmt + from_delta - to_time = gmt + to_delta - - location_string = random.choice(["where you are", - "your location"]) - - print "Between %s %s %s, depending on %s" % \ - (from_time.strftime("%H:%M"), - connector, - 
to_time.strftime("%H:%M on %A"), - location_string) - - else: - offset = int(offset) - - delta = datetime.timedelta(hours=offset) - the_time = gmt + delta - - print the_time.strftime("%H:%M on %A") - - -def print_age(results, target, metadata=None): - assert len(results["results"]["bindings"]) == 1 - - birth_date = results["results"]["bindings"][0][target]["value"] - year, month, days = birth_date.split("-") - - birth_date = datetime.date(int(year), int(month), int(days)) - - now = datetime.datetime.utcnow() - now = now.date() - - age = now - birth_date - print "{} years old".format(age.days / 365) - - -def wikipedia2dbpedia(wikipedia_url): - """ - Given a wikipedia URL returns the dbpedia resource - of that page. - """ - - query = """ - PREFIX foaf: - SELECT * WHERE { - ?url foaf:isPrimaryTopicOf <%s>. - } - """ % wikipedia_url - - sparql.setQuery(query) - sparql.setReturnFormat(JSON) - results = sparql.query().convert() - - if not results["results"]["bindings"]: - print "Snorql URL not found" - sys.exit(1) - else: - return results["results"]["bindings"][0]["url"]["value"] - - -if __name__ == "__main__": - default_questions = [ - "What is a car?", - "Who is Tom Cruise?", - "Who is George Lucas?", - "Who is Mirtha Legrand?", - # "List Microsoft software", - "Name Fiat cars", - "time in argentina", - "what time is it in Chile?", - "List movies directed by Martin Scorsese", - "How long is Pulp Fiction", - "which movies did Mel Gibson starred?", - "When was Gladiator released?", - "who directed Pocahontas?", - "actors of Fight Club", - ] - - if "-d" in sys.argv: - quepy.set_loglevel("DEBUG") - sys.argv.remove("-d") - - if len(sys.argv) > 1: - question = " ".join(sys.argv[1:]) - - if question.count("wikipedia.org"): - print wikipedia2dbpedia(sys.argv[1]) - sys.exit(0) - else: - questions = [question] - else: - questions = default_questions - - print_handlers = { - "define": print_define, - "enum": print_enum, - "time": print_time, - "literal": print_literal, - "age": print_age, - } - - for question in questions: - print question - print "-" * len(question) - - target, query, metadata = dbpedia.get_query(question) - - if isinstance(metadata, tuple): - query_type = metadata[0] - metadata = metadata[1] - else: - query_type = metadata - metadata = None - - if query is None: - print "Query not generated :(\n" - continue - - print query - - if target.startswith("?"): - target = target[1:] - if query: - sparql.setQuery(query) - sparql.setReturnFormat(JSON) - results = sparql.query().convert() - - if not results["results"]["bindings"]: - print "No answer found :(" - continue - - print_handlers[query_type](results, target, metadata) - print diff --git a/examples/freebase/main.py.bak b/examples/freebase/main.py.bak deleted file mode 100644 index 4d3fe95..0000000 --- a/examples/freebase/main.py.bak +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -""" -Main script for freebase quepy. - -Usage: - main.py [options] ... - -Options: - -r --request Queries the online database and prints the results -""" - -import json -import quepy -import urllib -from docopt import docopt - -service_url = 'https://www.googleapis.com/freebase/v1/mqlread' -freebase = quepy.install("freebase") - - -def request(query): - params = {'query': query} - url = service_url + '?' 
+ urllib.urlencode(params) - responses = json.loads(urllib.urlopen(url).read()) - return responses - - -def result_from_responses(responses, target): - if responses: - to_explore = responses["result"] - for key in target: - _to_explore = [] - for elem in to_explore: - for response in elem[key]: - _to_explore.append(response) - to_explore = _to_explore - - result = [] - for elem in to_explore: - if isinstance(elem, dict): - if "lang" in elem: - if elem["lang"] == "/lang/en": - result.append(elem.get("value", elem)) - else: - result.append(elem.get("value", elem)) - else: - result.append(elem) - return result - - -if __name__ == "__main__": - args = docopt(__doc__) - question = " ".join(args[""]) - target, query, metadata = freebase.get_query(question) - print query - - if args["--request"]: - print - responses = request(query) - if "error" in responses: - print responses - exit() - else: - for response in result_from_responses(responses, target): - print response diff --git a/tests/testapp/__init__.py.bak b/tests/testapp/__init__.py.bak deleted file mode 100644 index 03df946..0000000 --- a/tests/testapp/__init__.py.bak +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -Init for testapp quepy. -""" - -from basic import * \ No newline at end of file From d8ac1bf7f36635db5ae5d408f121badb78b5a89a Mon Sep 17 00:00:00 2001 From: Bowen Fu Date: Tue, 22 Aug 2017 14:17:52 +0800 Subject: [PATCH 07/10] remove bak --- examples/dbpedia/dbpedia/__init__.py.bak | 21 -- examples/dbpedia/dbpedia/basic.py.bak | 105 ---------- examples/dbpedia/dbpedia/country.py.bak | 99 ---------- examples/dbpedia/dbpedia/movies.py.bak | 184 ------------------ examples/dbpedia/dbpedia/music.py.bak | 97 --------- examples/dbpedia/dbpedia/people.py.bak | 66 ------- .../dbpedia/dbpedia/populated_place.py.bak | 60 ------ examples/dbpedia/dbpedia/settings.py.bak | 34 ---- examples/dbpedia/dbpedia/tvshows.py.bak | 124 ------------ examples/dbpedia/dbpedia/writers.py.bak | 69 ------- examples/freebase/freebase/__init__.py.bak | 14 -- examples/freebase/freebase/basic.py.bak | 54 ----- examples/freebase/freebase/country.py.bak | 94 --------- examples/freebase/freebase/movies.py.bak | 184 ------------------ examples/freebase/freebase/music.py.bak | 97 --------- examples/freebase/freebase/people.py.bak | 65 ------- examples/freebase/freebase/tvshows.py.bak | 112 ----------- examples/freebase/freebase/writers.py.bak | 69 ------- 18 files changed, 1548 deletions(-) delete mode 100644 examples/dbpedia/dbpedia/__init__.py.bak delete mode 100644 examples/dbpedia/dbpedia/basic.py.bak delete mode 100644 examples/dbpedia/dbpedia/country.py.bak delete mode 100644 examples/dbpedia/dbpedia/movies.py.bak delete mode 100644 examples/dbpedia/dbpedia/music.py.bak delete mode 100644 examples/dbpedia/dbpedia/people.py.bak delete mode 100644 examples/dbpedia/dbpedia/populated_place.py.bak delete mode 100644 examples/dbpedia/dbpedia/settings.py.bak delete mode 100644 examples/dbpedia/dbpedia/tvshows.py.bak delete mode 100644 examples/dbpedia/dbpedia/writers.py.bak delete mode 100644 examples/freebase/freebase/__init__.py.bak delete mode 100644 examples/freebase/freebase/basic.py.bak delete mode 100644 examples/freebase/freebase/country.py.bak delete mode 100644 
examples/freebase/freebase/movies.py.bak delete mode 100644 examples/freebase/freebase/music.py.bak delete mode 100644 examples/freebase/freebase/people.py.bak delete mode 100644 examples/freebase/freebase/tvshows.py.bak delete mode 100644 examples/freebase/freebase/writers.py.bak diff --git a/examples/dbpedia/dbpedia/__init__.py.bak b/examples/dbpedia/dbpedia/__init__.py.bak deleted file mode 100644 index 47605e7..0000000 --- a/examples/dbpedia/dbpedia/__init__.py.bak +++ /dev/null @@ -1,21 +0,0 @@ -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -DBpedia quepy. -""" - -from basic import * -from music import * -from movies import * -from people import * -from country import * -from populated_place import * -from tvshows import * -from writers import * diff --git a/examples/dbpedia/dbpedia/basic.py.bak b/examples/dbpedia/dbpedia/basic.py.bak deleted file mode 100644 index 16632f6..0000000 --- a/examples/dbpedia/dbpedia/basic.py.bak +++ /dev/null @@ -1,105 +0,0 @@ -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -Basic questions for DBpedia. -""" - -from refo import Group, Plus, Question -from quepy.parsing import Lemma, Pos, QuestionTemplate, Token, Particle, \ - Lemmas -from quepy.dsl import HasKeyword, IsRelatedTo, HasType -from dsl import DefinitionOf, LabelOf, IsPlace, \ - UTCof, LocationOf - - -# Openings -LISTOPEN = Lemma("list") | Lemma("name") - - -class Thing(Particle): - regex = Question(Pos("JJ")) + (Pos("NN") | Pos("NNP") | Pos("NNS")) |\ - Pos("VBN") - - def interpret(self, match): - return HasKeyword(match.words.tokens) - - -class WhatIs(QuestionTemplate): - """ - Regex for questions like "What is a blowtorch - Ex: "What is a car" - "What is Seinfield?" 
- """ - - regex = Lemma("what") + Lemma("be") + Question(Pos("DT")) + \ - Thing() + Question(Pos(".")) - - def interpret(self, match): - label = DefinitionOf(match.thing) - - return label, "define" - - -class ListEntity(QuestionTemplate): - """ - Regex for questions like "List Microsoft software" - """ - - entity = Group(Pos("NNP"), "entity") - target = Group(Pos("NN") | Pos("NNS"), "target") - regex = LISTOPEN + entity + target - - def interpret(self, match): - entity = HasKeyword(match.entity.tokens) - target_type = HasKeyword(match.target.lemmas) - target = HasType(target_type) + IsRelatedTo(entity) - label = LabelOf(target) - - return label, "enum" - - -class WhatTimeIs(QuestionTemplate): - """ - Regex for questions about the time - Ex: "What time is it in Cordoba" - """ - - nouns = Plus(Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS")) - place = Group(nouns, "place") - openings = (Lemma("what") + - ((Token("is") + Token("the") + Question(Lemma("current")) + - Question(Lemma("local")) + Lemma("time")) | - (Lemma("time") + Token("is") + Token("it")))) | \ - Lemma("time") - regex = openings + Pos("IN") + place + Question(Pos(".")) - - def interpret(self, match): - place = HasKeyword(match.place.lemmas.title()) + IsPlace() - utc_offset = UTCof(place) - - return utc_offset, "time" - - -class WhereIsQuestion(QuestionTemplate): - """ - Ex: "where in the world is the Eiffel Tower" - """ - - thing = Group(Plus(Pos("IN") | Pos("NP") | Pos("NNP") | Pos("NNPS")), - "thing") - regex = Lemma("where") + Question(Lemmas("in the world")) + Lemma("be") + \ - Question(Pos("DT")) + thing + Question(Pos(".")) - - def interpret(self, match): - thing = HasKeyword(match.thing.tokens) - location = LocationOf(thing) - location_name = LabelOf(location) - - return location_name, "enum" diff --git a/examples/dbpedia/dbpedia/country.py.bak b/examples/dbpedia/dbpedia/country.py.bak deleted file mode 100644 index 509de27..0000000 --- a/examples/dbpedia/dbpedia/country.py.bak +++ /dev/null @@ -1,99 +0,0 @@ -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -Country related regex -""" - -from refo import Plus, Question -from quepy.dsl import HasKeyword -from quepy.parsing import Lemma, Pos, QuestionTemplate, Token, Particle -from dsl import IsCountry, IncumbentOf, CapitalOf, \ - LabelOf, LanguageOf, PopulationOf, PresidentOf - - -class Country(Particle): - regex = Plus(Pos("DT") | Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS")) - - def interpret(self, match): - name = match.words.tokens.title() - return IsCountry() + HasKeyword(name) - - -class PresidentOfQuestion(QuestionTemplate): - """ - Regex for questions about the president of a country. - Ex: "Who is the president of Argentina?" - """ - - regex = Pos("WP") + Token("is") + Question(Pos("DT")) + \ - Lemma("president") + Pos("IN") + Country() + Question(Pos(".")) - - def interpret(self, match): - president = PresidentOf(match.country) - incumbent = IncumbentOf(president) - label = LabelOf(incumbent) - - return label, "enum" - - -class CapitalOfQuestion(QuestionTemplate): - """ - Regex for questions about the capital of a country. - Ex: "What is the capital of Bolivia?" 
- """ - - opening = Lemma("what") + Token("is") - regex = opening + Pos("DT") + Lemma("capital") + Pos("IN") + \ - Question(Pos("DT")) + Country() + Question(Pos(".")) - - def interpret(self, match): - capital = CapitalOf(match.country) - label = LabelOf(capital) - return label, "enum" - - -# FIXME: the generated query needs FILTER isLiteral() to the head -# because dbpedia sometimes returns different things -class LanguageOfQuestion(QuestionTemplate): - """ - Regex for questions about the language spoken in a country. - Ex: "What is the language of Argentina?" - "what language is spoken in Argentina?" - """ - - openings = (Lemma("what") + Token("is") + Pos("DT") + - Question(Lemma("official")) + Lemma("language")) | \ - (Lemma("what") + Lemma("language") + Token("is") + - Lemma("speak")) - - regex = openings + Pos("IN") + Question(Pos("DT")) + Country() + \ - Question(Pos(".")) - - def interpret(self, match): - language = LanguageOf(match.country) - return language, "enum" - - -class PopulationOfQuestion(QuestionTemplate): - """ - Regex for questions about the population of a country. - Ex: "What is the population of China?" - "How many people live in China?" - """ - - openings = (Pos("WP") + Token("is") + Pos("DT") + - Lemma("population") + Pos("IN")) | \ - (Pos("WRB") + Lemma("many") + Lemma("people") + - Token("live") + Pos("IN")) - regex = openings + Question(Pos("DT")) + Country() + Question(Pos(".")) - - def interpret(self, match): - population = PopulationOf(match.country) - return population, "literal" diff --git a/examples/dbpedia/dbpedia/movies.py.bak b/examples/dbpedia/dbpedia/movies.py.bak deleted file mode 100644 index 9a31750..0000000 --- a/examples/dbpedia/dbpedia/movies.py.bak +++ /dev/null @@ -1,184 +0,0 @@ -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -Movie related regex. -""" - -from refo import Plus, Question -from quepy.dsl import HasKeyword -from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle -from dsl import IsMovie, NameOf, IsPerson, \ - DirectedBy, LabelOf, DurationOf, HasActor, HasName, ReleaseDateOf, \ - DirectorOf, StarsIn, DefinitionOf - -nouns = Plus(Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS")) - - -class Movie(Particle): - regex = Question(Pos("DT")) + nouns - - def interpret(self, match): - name = match.words.tokens - return IsMovie() + HasName(name) - - -class Actor(Particle): - regex = nouns - - def interpret(self, match): - name = match.words.tokens - return IsPerson() + HasKeyword(name) - - -class Director(Particle): - regex = nouns - - def interpret(self, match): - name = match.words.tokens - return IsPerson() + HasKeyword(name) - - -class ListMoviesQuestion(QuestionTemplate): - """ - Ex: "list movies" - """ - - regex = Lemma("list") + (Lemma("movie") | Lemma("film")) - - def interpret(self, match): - movie = IsMovie() - name = NameOf(movie) - return name, "enum" - - -class MoviesByDirectorQuestion(QuestionTemplate): - """ - Ex: "List movies directed by Quentin Tarantino. 
- "movies directed by Martin Scorsese" - "which movies did Mel Gibson directed" - """ - - regex = (Question(Lemma("list")) + (Lemma("movie") | Lemma("film")) + - Question(Lemma("direct")) + Lemma("by") + Director()) | \ - (Lemma("which") + (Lemma("movie") | Lemma("film")) + Lemma("do") + - Director() + Lemma("direct") + Question(Pos("."))) - - def interpret(self, match): - movie = IsMovie() + DirectedBy(match.director) - movie_name = LabelOf(movie) - - return movie_name, "enum" - - -class MovieDurationQuestion(QuestionTemplate): - """ - Ex: "How long is Pulp Fiction" - "What is the duration of The Thin Red Line?" - """ - - regex = ((Lemmas("how long be") + Movie()) | - (Lemmas("what be") + Pos("DT") + Lemma("duration") + - Pos("IN") + Movie())) + \ - Question(Pos(".")) - - def interpret(self, match): - duration = DurationOf(match.movie) - return duration, ("literal", "{} minutes long") - - -class ActedOnQuestion(QuestionTemplate): - """ - Ex: "List movies with Hugh Laurie" - "Movies with Matt LeBlanc" - "In what movies did Jennifer Aniston appear?" - "Which movies did Mel Gibson starred?" - "Movies starring Winona Ryder" - """ - - acted_on = (Lemma("appear") | Lemma("act") | Lemma("star")) - movie = (Lemma("movie") | Lemma("movies") | Lemma("film")) - regex = (Question(Lemma("list")) + movie + Lemma("with") + Actor()) | \ - (Question(Pos("IN")) + (Lemma("what") | Lemma("which")) + - movie + Lemma("do") + Actor() + acted_on + Question(Pos("."))) | \ - (Question(Pos("IN")) + Lemma("which") + movie + Lemma("do") + - Actor() + acted_on) | \ - (Question(Lemma("list")) + movie + Lemma("star") + Actor()) - - def interpret(self, match): - movie = IsMovie() + HasActor(match.actor) - movie_name = NameOf(movie) - return movie_name, "enum" - - -class MovieReleaseDateQuestion(QuestionTemplate): - """ - Ex: "When was The Red Thin Line released?" - "Release date of The Empire Strikes Back" - """ - - regex = ((Lemmas("when be") + Movie() + Lemma("release")) | - (Lemma("release") + Question(Lemma("date")) + - Pos("IN") + Movie())) + \ - Question(Pos(".")) - - def interpret(self, match): - release_date = ReleaseDateOf(match.movie) - return release_date, "literal" - - -class DirectorOfQuestion(QuestionTemplate): - """ - Ex: "Who is the director of Big Fish?" - "who directed Pocahontas?" - """ - - regex = ((Lemmas("who be") + Pos("DT") + Lemma("director") + - Pos("IN") + Movie()) | - (Lemma("who") + Lemma("direct") + Movie())) + \ - Question(Pos(".")) - - def interpret(self, match): - director = IsPerson() + DirectorOf(match.movie) - director_name = NameOf(director) - return director_name, "literal" - - -class ActorsOfQuestion(QuestionTemplate): - """ - Ex: "who are the actors of Titanic?" - "who acted in Alien?" - "who starred in Depredator?" - "Actors of Fight Club" - """ - - regex = (Lemma("who") + Question(Lemma("be") + Pos("DT")) + - (Lemma("act") | Lemma("actor") | Lemma("star")) + - Pos("IN") + Movie() + Question(Pos("."))) | \ - ((Lemma("actors") | Lemma("actor")) + Pos("IN") + Movie()) - - def interpret(self, match): - actor = NameOf(IsPerson() + StarsIn(match.movie)) - return actor, "enum" - - -class PlotOfQuestion(QuestionTemplate): - """ - Ex: "what is Shame about?" 
- "plot of Titanic" - """ - - regex = ((Lemmas("what be") + Movie() + Lemma("about")) | \ - (Question(Lemmas("what be the")) + Lemma("plot") + - Pos("IN") + Movie())) + \ - Question(Pos(".")) - - def interpret(self, match): - definition = DefinitionOf(match.movie) - return definition, "define" diff --git a/examples/dbpedia/dbpedia/music.py.bak b/examples/dbpedia/dbpedia/music.py.bak deleted file mode 100644 index 006371b..0000000 --- a/examples/dbpedia/dbpedia/music.py.bak +++ /dev/null @@ -1,97 +0,0 @@ -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -Music related regex -""" - -from refo import Plus, Question -from quepy.dsl import HasKeyword -from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle -from dsl import IsBand, LabelOf, IsMemberOf, ActiveYears, MusicGenreOf, \ - NameOf, IsAlbum, ProducedBy - - -class Band(Particle): - regex = Question(Pos("DT")) + Plus(Pos("NN") | Pos("NNP")) - - def interpret(self, match): - name = match.words.tokens.title() - return IsBand() + HasKeyword(name) - - -class BandMembersQuestion(QuestionTemplate): - """ - Regex for questions about band member. - Ex: "Radiohead members" - "What are the members of Metallica?" - """ - - regex1 = Band() + Lemma("member") - regex2 = Lemma("member") + Pos("IN") + Band() - regex3 = Pos("WP") + Lemma("be") + Pos("DT") + Lemma("member") + \ - Pos("IN") + Band() - - regex = (regex1 | regex2 | regex3) + Question(Pos(".")) - - def interpret(self, match): - member = IsMemberOf(match.band) - label = LabelOf(member) - return label, "enum" - - -class FoundationQuestion(QuestionTemplate): - """ - Regex for questions about the creation of a band. - Ex: "When was Pink Floyd founded?" - "When was Korn formed?" - """ - - regex = Pos("WRB") + Lemma("be") + Band() + \ - (Lemma("form") | Lemma("found")) + Question(Pos(".")) - - def interpret(self, match): - active_years = ActiveYears(match.band) - return active_years, "literal" - - -class GenreQuestion(QuestionTemplate): - """ - Regex for questions about the genre of a band. - Ex: "What is the music genre of Gorillaz?" - "Music genre of Radiohead" - """ - - optional_opening = Question(Pos("WP") + Lemma("be") + Pos("DT")) - regex = optional_opening + Question(Lemma("music")) + Lemma("genre") + \ - Pos("IN") + Band() + Question(Pos(".")) - - def interpret(self, match): - genre = MusicGenreOf(match.band) - label = LabelOf(genre) - return label, "enum" - - -class AlbumsOfQuestion(QuestionTemplate): - """ - Ex: "List albums of Pink Floyd" - "What albums did Pearl Jam record?" - "Albums by Metallica" - """ - - regex = (Question(Lemma("list")) + (Lemma("album") | Lemma("albums")) + \ - Pos("IN") + Band()) | \ - (Lemmas("what album do") + Band() + - (Lemma("record") | Lemma("make")) + Question(Pos("."))) | \ - (Lemma("list") + Band() + Lemma("album")) - - def interpret(self, match): - album = IsAlbum() + ProducedBy(match.band) - name = NameOf(album) - return name, "enum" diff --git a/examples/dbpedia/dbpedia/people.py.bak b/examples/dbpedia/dbpedia/people.py.bak deleted file mode 100644 index a7f263e..0000000 --- a/examples/dbpedia/dbpedia/people.py.bak +++ /dev/null @@ -1,66 +0,0 @@ -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. 
-# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -People related regex -""" - -from refo import Plus, Question -from quepy.dsl import HasKeyword -from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle -from dsl import IsPerson, LabelOf, DefinitionOf, BirthDateOf, BirthPlaceOf - - -class Person(Particle): - regex = Plus(Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS")) - - def interpret(self, match): - name = match.words.tokens - return IsPerson() + HasKeyword(name) - - -class WhoIs(QuestionTemplate): - """ - Ex: "Who is Tom Cruise?" - """ - - regex = Lemma("who") + Lemma("be") + Person() + \ - Question(Pos(".")) - - def interpret(self, match): - definition = DefinitionOf(match.person) - return definition, "define" - - -class HowOldIsQuestion(QuestionTemplate): - """ - Ex: "How old is Bob Dylan". - """ - - regex = Pos("WRB") + Lemma("old") + Lemma("be") + Person() + \ - Question(Pos(".")) - - def interpret(self, match): - birth_date = BirthDateOf(match.person) - return birth_date, "age" - - -class WhereIsFromQuestion(QuestionTemplate): - """ - Ex: "Where is Bill Gates from?" - """ - - regex = Lemmas("where be") + Person() + Lemma("from") + \ - Question(Pos(".")) - - def interpret(self, match): - birth_place = BirthPlaceOf(match.person) - label = LabelOf(birth_place) - - return label, "enum" diff --git a/examples/dbpedia/dbpedia/populated_place.py.bak b/examples/dbpedia/dbpedia/populated_place.py.bak deleted file mode 100644 index 3291c38..0000000 --- a/examples/dbpedia/dbpedia/populated_place.py.bak +++ /dev/null @@ -1,60 +0,0 @@ -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -Populated place related regex -""" - -from refo import Plus, Question -from quepy.dsl import HasKeyword -from quepy.parsing import Lemma, Pos, QuestionTemplate, Token, Particle -from dsl import IsPopulatedPlace, IncumbentOf, CapitalOf, \ - LabelOf, PopulationOf - - -class PopulatedPlace(Particle): - regex = Plus(Pos("DT") | Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS")) - - def interpret(self, match): - name = match.words.tokens.title() - return IsPopulatedPlace() + HasKeyword(name) - - -class CapitalOfQuestion(QuestionTemplate): - """ - Regex for questions about the capital of a country. - Ex: "What is the capital of Massachussets?" - """ - - opening = Lemma("what") + Token("is") - regex = opening + Pos("DT") + Lemma("capital") + Pos("IN") + \ - Question(Pos("DT")) + PopulatedPlace() + Question(Pos(".")) - - def interpret(self, match): - capital = CapitalOf(match.populatedplace) - label = LabelOf(capital) - return label, "enum" - - -class PopulationOfQuestion(QuestionTemplate): - """ - Regex for questions about the population of a country. - Ex: "What is the population of Cordoba?" - "How many people live in Cordoba?" 
- """ - - openings = (Pos("WP") + Token("is") + Pos("DT") + - Lemma("population") + Pos("IN")) | \ - (Pos("WRB") + Lemma("many") + Lemma("people") + - Token("live") + Pos("IN")) - regex = openings + Question(Pos("DT")) + PopulatedPlace() + Question(Pos(".")) - - def interpret(self, match): - population = PopulationOf(match.populatedplace) - return population, "literal" diff --git a/examples/dbpedia/dbpedia/settings.py.bak b/examples/dbpedia/dbpedia/settings.py.bak deleted file mode 100644 index a75779b..0000000 --- a/examples/dbpedia/dbpedia/settings.py.bak +++ /dev/null @@ -1,34 +0,0 @@ -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -Settings. -""" - -# Generated query language -LANGUAGE = "sparql" - -# NLTK config -NLTK_DATA_PATH = [] # List of paths with NLTK data - -# Encoding config -DEFAULT_ENCODING = "utf-8" - -# Sparql config -SPARQL_PREAMBLE = u""" -PREFIX owl: -PREFIX rdfs: -PREFIX rdf: -PREFIX foaf: -PREFIX skos: -PREFIX quepy: -PREFIX dbpedia: -PREFIX dbpprop: -PREFIX dbpedia-owl: -""" diff --git a/examples/dbpedia/dbpedia/tvshows.py.bak b/examples/dbpedia/dbpedia/tvshows.py.bak deleted file mode 100644 index 16c6144..0000000 --- a/examples/dbpedia/dbpedia/tvshows.py.bak +++ /dev/null @@ -1,124 +0,0 @@ -# coding: utf-8 - -""" -Tv Shows related regex. -""" - -from refo import Plus, Question -from quepy.dsl import HasKeyword -from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle -from dsl import IsTvShow, ReleaseDateOf, IsPerson, StarsIn, LabelOf, \ - HasShowName, NumberOfEpisodesIn, HasActor, ShowNameOf, CreatorOf - -nouns = Plus(Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS")) - - -class TvShow(Particle): - regex = Plus(Question(Pos("DT")) + nouns) - - def interpret(self, match): - name = match.words.tokens - return IsTvShow() + HasShowName(name) - - -class Actor(Particle): - regex = nouns - - def interpret(self, match): - name = match.words.tokens - return IsPerson() + HasKeyword(name) - - -# FIXME: clash with movies release regex -class ReleaseDateQuestion(QuestionTemplate): - """ - Ex: when was Friends release? - """ - - regex = Lemmas("when be") + TvShow() + Lemma("release") + \ - Question(Pos(".")) - - def interpret(self, match): - release_date = ReleaseDateOf(match.tvshow) - return release_date, "literal" - - -class CastOfQuestion(QuestionTemplate): - """ - Ex: "What is the cast of Friends?" - "Who works in Breaking Bad?" - "List actors of Seinfeld" - """ - - regex = (Question(Lemmas("what be") + Pos("DT")) + - Lemma("cast") + Pos("IN") + TvShow() + Question(Pos("."))) | \ - (Lemmas("who works") + Pos("IN") + TvShow() + - Question(Pos("."))) | \ - (Lemmas("list actor") + Pos("IN") + TvShow()) - - def interpret(self, match): - actor = IsPerson() + StarsIn(match.tvshow) - name = LabelOf(actor) - return name, "enum" - - -class ListTvShows(QuestionTemplate): - """ - Ex: "List TV shows" - """ - - regex = Lemmas("list tv show") - - def interpret(self, match): - show = IsTvShow() - label = LabelOf(show) - return label, "enum" - - -class EpisodeCountQuestion(QuestionTemplate): - """ - Ex: "How many episodes does Seinfeld have?" 
- "Number of episodes of Seinfeld" - """ - - regex = ((Lemmas("how many episode do") + TvShow() + Lemma("have")) | - (Lemma("number") + Pos("IN") + Lemma("episode") + - Pos("IN") + TvShow())) + \ - Question(Pos(".")) - - def interpret(self, match): - number_of_episodes = NumberOfEpisodesIn(match.tvshow) - return number_of_episodes, "literal" - - -class ShowsWithQuestion(QuestionTemplate): - """ - Ex: "List shows with Hugh Laurie" - "In what shows does Jennifer Aniston appears?" - "Shows with Matt LeBlanc" - """ - - regex = (Lemmas("list show") + Pos("IN") + Actor()) | \ - (Pos("IN") + (Lemma("what") | Lemma("which")) + Lemmas("show do") + - Actor() + (Lemma("appear") | Lemma("work")) + - Question(Pos("."))) | \ - ((Lemma("show") | Lemma("shows")) + Pos("IN") + Actor()) - - def interpret(self, match): - show = IsTvShow() + HasActor(match.actor) - show_name = ShowNameOf(show) - return show_name, "enum" - - -class CreatorOfQuestion(QuestionTemplate): - """ - Ex: "Who is the creator of Breaking Bad?" - """ - - regex = Question(Lemmas("who be") + Pos("DT")) + \ - Lemma("creator") + Pos("IN") + TvShow() + Question(Pos(".")) - - def interpret(self, match): - creator = CreatorOf(match.tvshow) - label = LabelOf(creator) - return label, "enum" diff --git a/examples/dbpedia/dbpedia/writers.py.bak b/examples/dbpedia/dbpedia/writers.py.bak deleted file mode 100644 index 5affc14..0000000 --- a/examples/dbpedia/dbpedia/writers.py.bak +++ /dev/null @@ -1,69 +0,0 @@ -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -Writers related regex. -""" - - -from refo import Plus, Question -from quepy.dsl import HasKeyword -from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle -from dsl import IsBook, HasAuthor, AuthorOf, IsPerson, NameOf - - -nouns = Pos("DT") | Pos("IN") | Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS") - - -class Book(Particle): - regex = Plus(nouns) - - def interpret(self, match): - name = match.words.tokens - return IsBook() + HasKeyword(name) - - -class Author(Particle): - regex = Plus(nouns | Lemma(".")) - - def interpret(self, match): - name = match.words.tokens - return IsPerson() + HasKeyword(name) - - -class WhoWroteQuestion(QuestionTemplate): - """ - Ex: "who wrote The Little Prince?" - "who is the author of A Game Of Thrones?" - """ - - regex = ((Lemmas("who write") + Book()) | - (Question(Lemmas("who be") + Pos("DT")) + - Lemma("author") + Pos("IN") + Book())) + \ - Question(Pos(".")) - - def interpret(self, match): - author = NameOf(IsPerson() + AuthorOf(match.book)) - return author, "literal" - - -class BooksByAuthorQuestion(QuestionTemplate): - """ - Ex: "list books by George Orwell" - "which books did Suzanne Collins wrote?" - """ - - regex = (Question(Lemma("list")) + Lemmas("book by") + Author()) | \ - ((Lemma("which") | Lemma("what")) + Lemmas("book do") + - Author() + Lemma("write") + Question(Pos("."))) - - def interpret(self, match): - book = IsBook() + HasAuthor(match.author) - book_name = NameOf(book) - return book_name, "enum" diff --git a/examples/freebase/freebase/__init__.py.bak b/examples/freebase/freebase/__init__.py.bak deleted file mode 100644 index d3777a6..0000000 --- a/examples/freebase/freebase/__init__.py.bak +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -""" -Init for freebase quepy. 
-""" - -from basic import * -from music import * -from people import * -from movies import * -from country import * -from tvshows import * -from writers import * \ No newline at end of file diff --git a/examples/freebase/freebase/basic.py.bak b/examples/freebase/freebase/basic.py.bak deleted file mode 100644 index 83b81b5..0000000 --- a/examples/freebase/freebase/basic.py.bak +++ /dev/null @@ -1,54 +0,0 @@ -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -Basic questions for Freebase. -""" - -from refo import Question, Plus -from dsl import DefinitionOf, NameOf, LocationOf -from quepy.dsl import HasKeyword -from quepy.parsing import QuestionTemplate, Particle, Lemma, Pos, Lemmas - - -class Thing(Particle): - regex = Plus(Question(Pos("JJ")) + (Pos("NN") | Pos("NNP") | Pos("NNS")) | - Pos("VBN")) - - def interpret(self, match): - return HasKeyword(match.words.tokens) - - -class WhatIs(QuestionTemplate): - """ - Regex for questions like "What is a blowtorch - Ex: "What is a car" - "What is Seinfield?" - """ - - regex = Lemma("what") + Lemma("be") + Question(Pos("DT")) + \ - Thing() + Question(Pos(".")) - - def interpret(self, match): - label = DefinitionOf(match.thing) - return label - - -class WhereIsQuestion(QuestionTemplate): - """ - Ex: "where in the world is the Eiffel Tower" - """ - - regex = Lemma("where") + Question(Lemmas("in the world")) + Lemma("be") + \ - Question(Pos("DT")) + Thing() + Question(Pos(".")) - - def interpret(self, match): - location = LocationOf(match.thing) - location_name = NameOf(location) - return location_name diff --git a/examples/freebase/freebase/country.py.bak b/examples/freebase/freebase/country.py.bak deleted file mode 100644 index dcdea15..0000000 --- a/examples/freebase/freebase/country.py.bak +++ /dev/null @@ -1,94 +0,0 @@ -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -Coutry related regex -""" - -from dsl import * -from refo import Plus, Question -from quepy.dsl import HasKeyword -from quepy.parsing import Lemma, Pos, QuestionTemplate, Token, Particle - - -class Country(Particle): - regex = Plus(Pos("DT") | Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS")) - - def interpret(self, match): - name = match.words.tokens.title() - return IsCountry() + HasKeyword(name) - - -class PresidentOfQuestion(QuestionTemplate): - """ - Ex: "list presidents of Argentina?" - """ - - regex = Question(Lemma("list")) + Lemma("president") + Pos("IN") + \ - Country() + Question(Pos(".")) - - def interpret(self, match): - president = IsPresident() + PresidentOf(match.country) - name = NameOf(OfficeHolderOf(president)) - return name - - -class CapitalOfQuestion(QuestionTemplate): - """ - Regex for questions about the capital of a country. - Ex: "What is the capital of Bolivia?" 
- """ - - opening = Lemma("what") + Token("is") - regex = opening + Pos("DT") + Lemma("capital") + Pos("IN") + \ - Question(Pos("DT")) + Country() + Question(Pos(".")) - - def interpret(self, match): - capital = CapitalOf(match.country) - label = NameOf(capital) - return label - - -class LanguageOfQuestion(QuestionTemplate): - """ - Regex for questions about the language spoken in a country. - Ex: "What is the language of Argentina?" - "what language is spoken in Argentina?" - """ - - openings = (Lemma("what") + Token("is") + Pos("DT") + - Question(Lemma("official")) + Lemma("language")) | \ - (Lemma("what") + Lemma("language") + Token("is") + - Lemma("speak")) - - regex = openings + Pos("IN") + Question(Pos("DT")) + Country() + \ - Question(Pos(".")) - - def interpret(self, match): - language = LanguageOf(match.country) - name = NameOf(language) - return name - - -class PopulationOfQuestion(QuestionTemplate): - """ - Regex for questions about the population of a country. - Ex: "What is the population of China?" - "How many people live in China?" - """ - - openings = (Pos("WP") + Token("is") + Pos("DT") + - Lemma("population") + Pos("IN")) | \ - (Pos("WRB") + Lemma("many") + Lemma("people") + - Token("live") + Pos("IN")) - regex = openings + Question(Pos("DT")) + Country() + Question(Pos(".")) - - def interpret(self, match): - population = NumberOf(PopulationOf(match.country)) - return population diff --git a/examples/freebase/freebase/movies.py.bak b/examples/freebase/freebase/movies.py.bak deleted file mode 100644 index 38837ef..0000000 --- a/examples/freebase/freebase/movies.py.bak +++ /dev/null @@ -1,184 +0,0 @@ -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -Movie related regex. -""" - -from refo import Plus, Question -from quepy.dsl import HasKeyword -from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle -from dsl import * - -nouns = Plus(Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS")) - - -class Movie(Particle): - regex = Question(Pos("DT")) + nouns - - def interpret(self, match): - name = match.words.tokens - return IsMovie() + HasName(name) - - -class Actor(Particle): - regex = nouns - - def interpret(self, match): - name = match.words.tokens - return IsPerson() + IsActor() + HasKeyword(name) - - -class Director(Particle): - regex = nouns - - def interpret(self, match): - name = match.words.tokens - return IsPerson() + IsDirector() + HasKeyword(name) - - -class ListMoviesQuestion(QuestionTemplate): - """ - Ex: "list movies" - """ - - regex = Lemma("list") + (Lemma("movie") | Lemma("film")) - - def interpret(self, match): - movie = IsMovie() - name = NameOf(movie) - return name - - -class MoviesByDirectorQuestion(QuestionTemplate): - """ - Ex: "List movies directed by Quentin Tarantino. 
- "movies directed by Martin Scorsese" - "which movies did Mel Gibson directed" - """ - - regex = (Question(Lemma("list")) + (Lemma("movie") | Lemma("film")) + - Question(Lemma("direct")) + Lemma("by") + Director()) | \ - (Lemma("which") + (Lemma("movie") | Lemma("film")) + Lemma("do") + - Director() + Lemma("direct") + Question(Pos("."))) - - def interpret(self, match): - movie = IsMovie() + DirectedBy(match.director) - movie_name = NameOf(movie) - return movie_name - - -class MovieDurationQuestion(QuestionTemplate): - """ - Ex: "How long is Pulp Fiction" - "What is the duration of The Thin Red Line?" - """ - - regex = ((Lemmas("how long be") + Movie()) | - (Lemmas("what be") + Pos("DT") + Lemma("duration") + - Pos("IN") + Movie())) + \ - Question(Pos(".")) - - def interpret(self, match): - duration = DurationOf(RuntimeOf(match.movie)) - return duration - - -class ActedOnQuestion(QuestionTemplate): - """ - Ex: "List movies with Hugh Laurie" - "Movies with Matt LeBlanc" - "In what movies did Jennifer Aniston appear?" - "Which movies did Mel Gibson starred?" - "Movies starring Winona Ryder" - """ - - acted_on = (Lemma("appear") | Lemma("act") | Lemma("star")) - movie = (Lemma("movie") | Lemma("movies") | Lemma("film")) - regex = (Question(Lemma("list")) + movie + Lemma("with") + Actor()) | \ - (Question(Pos("IN")) + (Lemma("what") | Lemma("which")) + - movie + Lemma("do") + Actor() + acted_on + Question(Pos("."))) | \ - (Question(Pos("IN")) + Lemma("which") + movie + Lemma("do") + - Actor() + acted_on) | \ - (Question(Lemma("list")) + movie + Lemma("star") + Actor()) - - def interpret(self, match): - performance = IsPerformance() + PerformanceOfActor(match.actor) - movie = IsMovie() + HasPerformance(performance) - movie_name = NameOf(movie) - return movie_name - - -class MovieReleaseDateQuestion(QuestionTemplate): - """ - Ex: "When was The Red Thin Line released?" - "Release date of The Empire Strikes Back" - """ - - regex = ((Lemmas("when be") + Movie() + Lemma("release")) | - (Lemma("release") + Question(Lemma("date")) + - Pos("IN") + Movie())) + \ - Question(Pos(".")) - - def interpret(self, match): - release_date = ReleaseDateOf(match.movie) - return release_date - - -class DirectorOfQuestion(QuestionTemplate): - """ - Ex: "Who is the director of Big Fish?" - "who directed Pocahontas?" - """ - - regex = ((Lemmas("who be") + Pos("DT") + Lemma("director") + - Pos("IN") + Movie()) | - (Lemma("who") + Lemma("direct") + Movie())) + \ - Question(Pos(".")) - - def interpret(self, match): - director = IsPerson() + DirectorOf(match.movie) - director_name = NameOf(director) - return director_name - - -class ActorsOfQuestion(QuestionTemplate): - """ - Ex: "who are the actors of Titanic?" - "who acted in Alien?" - "who starred in Depredator?" - "Actors of Fight Club" - """ - - regex = (Lemma("who") + Question(Lemma("be") + Pos("DT")) + - (Lemma("act") | Lemma("actor") | Lemma("star")) + - Pos("IN") + Movie() + Question(Pos("."))) | \ - ((Lemma("actors") | Lemma("actor")) + Pos("IN") + Movie()) - - def interpret(self, match): - performance = IsPerformance() + PerformanceOfMovie(match.movie) - actor = IsActor() + PerformsIn(performance) - name = NameOf(actor) - return name - - -class PlotOfQuestion(QuestionTemplate): - """ - Ex: "what is Shame about?" 
- "plot of Titanic" - """ - - regex = ((Lemmas("what be") + Movie() + Lemma("about")) | \ - (Question(Lemmas("what be the")) + Lemma("plot") + - Pos("IN") + Movie())) + \ - Question(Pos(".")) - - def interpret(self, match): - definition = DefinitionOf(match.movie) - return definition diff --git a/examples/freebase/freebase/music.py.bak b/examples/freebase/freebase/music.py.bak deleted file mode 100644 index f38d01a..0000000 --- a/examples/freebase/freebase/music.py.bak +++ /dev/null @@ -1,97 +0,0 @@ -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -Music related regex -""" - -from dsl import * -from refo import Plus, Question -from quepy.dsl import HasKeyword -from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle - - -class Band(Particle): - regex = Question(Pos("DT")) + Plus(Pos("NN") | Pos("NNP")) - - def interpret(self, match): - name = match.words.tokens.title() - return IsBand() + HasKeyword(name) - - -class BandMembersQuestion(QuestionTemplate): - """ - Regex for questions about band member. - Ex: "Radiohead members" - "What are the members of Metallica?" - """ - - regex1 = Band() + Lemma("member") - regex2 = Lemma("member") + Pos("IN") + Band() - regex3 = Pos("WP") + Lemma("be") + Pos("DT") + Lemma("member") + \ - Pos("IN") + Band() - - regex = (regex1 | regex2 | regex3) + Question(Pos(".")) - - def interpret(self, match): - group = GroupOf(match.band) - member = IsPerson() + IsMusicArtist() + IsMemberOf(group) - name = NameOf(member) - return name - - -class FoundationQuestion(QuestionTemplate): - """ - Regex for questions about the creation of a band. - Ex: "When was Pink Floyd founded?" - "When was Korn formed?" - """ - - regex = Pos("WRB") + Lemma("be") + Band() + \ - (Lemma("form") | Lemma("found")) + Question(Pos(".")) - - def interpret(self, match): - active_years = ActiveYearsOf(match.band) - return active_years - - -class GenreQuestion(QuestionTemplate): - """ - Regex for questions about the genre of a band. - Ex: "What is the music genre of Gorillaz?" - "Music genre of Radiohead" - """ - - optional_opening = Question(Pos("WP") + Lemma("be") + Pos("DT")) - regex = optional_opening + Question(Lemma("music")) + Lemma("genre") + \ - Pos("IN") + Band() + Question(Pos(".")) - - def interpret(self, match): - genre = MusicGenreOf(match.band) - name = NameOf(genre) - return name - - -class AlbumsOfQuestion(QuestionTemplate): - """ - Ex: "List albums of Pink Floyd" - "What albums did Pearl Jam record?" - "Albums by Metallica" - """ - - regex = (Question(Lemma("list")) + (Lemma("album") | Lemma("albums")) + \ - Pos("IN") + Band()) | \ - (Lemmas("what album do") + Band() + - (Lemma("record") | Lemma("make")) + Question(Pos("."))) | \ - (Lemma("list") + Band() + Lemma("album")) - - def interpret(self, match): - album = IsAlbum() + ProducedBy(match.band) - name = NameOf(album) - return name diff --git a/examples/freebase/freebase/people.py.bak b/examples/freebase/freebase/people.py.bak deleted file mode 100644 index 045e43a..0000000 --- a/examples/freebase/freebase/people.py.bak +++ /dev/null @@ -1,65 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. 
-# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -People related regex -""" - -from dsl import * -from refo import Plus, Question -from quepy.dsl import HasKeyword -from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle - - -class Person(Particle): - regex = Plus(Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS")) - - def interpret(self, match): - name = match.words.tokens - return IsPerson() + HasKeyword(name) - - -class WhoIs(QuestionTemplate): - """ - Ex: "Who is Tom Cruise?" - """ - - regex = Lemma("who") + Lemma("be") + Person() + \ - Question(Pos(".")) - - def interpret(self, match): - definition = DefinitionOf(match.person) - return definition - - -class HowOldIsQuestion(QuestionTemplate): - """ - Ex: "How old is Bob Dylan". - """ - - regex = Pos("WRB") + Lemma("old") + Lemma("be") + Person() + \ - Question(Pos(".")) - - def interpret(self, match): - birth_date = BirthDateOf(match.person) - return birth_date - - -class WhereIsFromQuestion(QuestionTemplate): - """ - Ex: "Where is Bill Gates from?" - """ - - regex = Lemmas("where be") + Person() + Lemma("from") + \ - Question(Pos(".")) - - def interpret(self, match): - birth_place = BirthPlaceOf(match.person) - name = NameOf(birth_place) - return name diff --git a/examples/freebase/freebase/tvshows.py.bak b/examples/freebase/freebase/tvshows.py.bak deleted file mode 100644 index 1f7f096..0000000 --- a/examples/freebase/freebase/tvshows.py.bak +++ /dev/null @@ -1,112 +0,0 @@ -# coding: utf-8 - -""" -Tv Shows related regex. -""" - -from dsl import * -from refo import Plus, Question -from quepy.dsl import HasKeyword -from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle - -nouns = Plus(Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS")) - - -class TvShow(Particle): - regex = Plus(Question(Pos("DT")) + nouns) - - def interpret(self, match): - name = match.words.tokens - return IsTvShow() + HasName(name) - - -class Actor(Particle): - regex = nouns - - def interpret(self, match): - name = match.words.tokens - return IsPerson() + HasName(name) - - -class CastOfQuestion(QuestionTemplate): - """ - Ex: "What is the cast of Friends?" - "Who works in Breaking Bad?" - "List actors of Seinfeld" - """ - - regex = (Question(Lemmas("what be") + Pos("DT")) + - Lemma("cast") + Pos("IN") + TvShow() + Question(Pos("."))) | \ - (Lemmas("who works") + Pos("IN") + TvShow() + - Question(Pos("."))) | \ - (Lemmas("list actor") + Pos("IN") + TvShow()) - - def interpret(self, match): - cast = CastOf(match.tvshow) - actor = IsPerson() + IsActorOf(cast) - name = NameOf(actor) - return name - - -class ListTvShows(QuestionTemplate): - """ - Ex: "List TV shows" - """ - - regex = Lemmas("list tv show") - - def interpret(self, match): - show = IsTvShow() - label = NameOf(show) - return label - - -class EpisodeCountQuestion(QuestionTemplate): - """ - Ex: "How many episodes does Seinfeld have?" - "Number of episodes of Seinfeld" - """ - - regex = ((Lemmas("how many episode do") + TvShow() + Lemma("have")) | - (Lemma("number") + Pos("IN") + Lemma("episode") + - Pos("IN") + TvShow())) + \ - Question(Pos(".")) - - def interpret(self, match): - number_of_episodes = NumberOfEpisodesIn(match.tvshow) - return number_of_episodes - - -class ShowsWithQuestion(QuestionTemplate): - """ - Ex: "List shows with Hugh Laurie" - "In what shows does Jennifer Aniston appears?" 
- "Shows with Matt LeBlanc" - """ - - regex = (Lemmas("list show") + Pos("IN") + Actor()) | \ - (Pos("IN") + (Lemma("what") | Lemma("which")) + Lemmas("show do") + - Actor() + (Lemma("appear") | Lemma("work")) + - Question(Pos("."))) | \ - ((Lemma("show") | Lemma("shows")) + Pos("IN") + Actor()) - - def interpret(self, match): - cast = HasActor(match.actor) - show = IsTvShow() + HasCast(cast) - show_name = NameOf(show) - return show_name - - -class CreatorOfQuestion(QuestionTemplate): - """ - Ex: "Who is the creator of Breaking Bad?" - "Who are the creators of Friends?" - """ - - regex = Question(Lemmas("who be") + Pos("DT")) + \ - Lemma("creator") + Pos("IN") + TvShow() + Question(Pos(".")) - - def interpret(self, match): - creator = CreatorOf(match.tvshow) - name = NameOf(creator) - return name diff --git a/examples/freebase/freebase/writers.py.bak b/examples/freebase/freebase/writers.py.bak deleted file mode 100644 index f66e330..0000000 --- a/examples/freebase/freebase/writers.py.bak +++ /dev/null @@ -1,69 +0,0 @@ -# coding: utf-8 - -# Copyright (c) 2012, Machinalis S.R.L. -# This file is part of quepy and is distributed under the Modified BSD License. -# You should have received a copy of license in the LICENSE file. -# -# Authors: Rafael Carrascosa -# Gonzalo Garcia Berrotaran - -""" -Writers related regex. -""" - - -from dsl import * -from refo import Plus, Question -from quepy.dsl import HasKeyword -from quepy.parsing import Lemma, Lemmas, Pos, QuestionTemplate, Particle - - -nouns = Pos("DT") | Pos("IN") | Pos("NN") | Pos("NNS") | Pos("NNP") | Pos("NNPS") - - -class Book(Particle): - regex = Plus(nouns) - - def interpret(self, match): - name = match.words.tokens - return IsBook() + HasKeyword(name) - - -class Author(Particle): - regex = Plus(nouns | Lemma(".")) - - def interpret(self, match): - name = match.words.tokens - return IsPerson() + HasKeyword(name) - - -class WhoWroteQuestion(QuestionTemplate): - """ - Ex: "who wrote The Little Prince?" - "who is the author of A Game Of Thrones?" - """ - - regex = ((Lemmas("who write") + Book()) | - (Question(Lemmas("who be") + Pos("DT")) + - Lemma("author") + Pos("IN") + Book())) + \ - Question(Pos(".")) - - def interpret(self, match): - author = NameOf(IsPerson() + AuthorOf(match.book)) - return author - - -class BooksByAuthorQuestion(QuestionTemplate): - """ - Ex: "list books by George Orwell" - "which books did Suzanne Collins wrote?" 
- """ - - regex = (Question(Lemma("list")) + Lemmas("book by") + Author()) | \ - ((Lemma("which") | Lemma("what")) + Lemmas("book do") + - Author() + Lemma("write") + Question(Pos("."))) - - def interpret(self, match): - book = IsBook() + HasAuthor(match.author) - book_name = NameOf(book) - return book_name From 381d56ea5f41adda53c44bd2cd9e2f1c99beca9b Mon Sep 17 00:00:00 2001 From: Bowen Fu Date: Tue, 22 Aug 2017 16:00:02 +0800 Subject: [PATCH 08/10] cannot resolve test error --- examples/dbpedia/main.py | 7 +++++-- tests/test_dot_generation.py | 15 ++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/examples/dbpedia/main.py b/examples/dbpedia/main.py index 86b2ccf..fd05a20 100644 --- a/examples/dbpedia/main.py +++ b/examples/dbpedia/main.py @@ -139,11 +139,13 @@ def wikipedia2dbpedia(wikipedia_url): if __name__ == "__main__": default_questions = [ - "What is a car?", + "谁是奥巴马?", + "What is a car?",] + """ "Who is Tom Cruise?", "Who is George Lucas?", "Who is Mirtha Legrand?", - # "List Microsoft software", + "List Microsoft software", "Name Fiat cars", "time in argentina", "what time is it in Chile?", @@ -154,6 +156,7 @@ def wikipedia2dbpedia(wikipedia_url): "who directed Pocahontas?", "actors of Fight Club", ] + """ if "-d" in sys.argv: quepy.set_loglevel("DEBUG") diff --git a/tests/test_dot_generation.py b/tests/test_dot_generation.py index c26db1b..8d8ccef 100644 --- a/tests/test_dot_generation.py +++ b/tests/test_dot_generation.py @@ -63,24 +63,21 @@ def test_dot_takes_fails_ascii2(self): def test_dot_stress(self): seed("I have come here to chew bubblegum and kick ass... " "and I'm all out of bubblegum.") - dot_file = tempfile.NamedTemporaryFile() - cmdline = "dot %s" % dot_file.name + dot_file_name = 'test_temp_file' + cmdline = "dot %s" % dot_file_name msg = "dot returned error code {}, check {} input file." 
         for _ in range(100):
             expression = random_expression()
             _, dot_string = expression_to_dot(expression)
-            with open(dot_file.name, "w") as filehandler:
-                filehandler.write(dot_string.encode("utf-8"))
+            with open(dot_file_name, "w") as filehandler:
+                filehandler.write(dot_string)
 
             try:
-                retcode = subprocess.call(cmdline.split(),
-                                          stdout=tempfile.TemporaryFile())
+                retcode = subprocess.call(cmdline.split())
             except OSError:
                 print("Warning: the program 'dot' was not found, tests skipped")
                 return
-            if retcode != 0:
-                dot_file.delete = False
-            self.assertEqual(retcode, 0, msg.format(retcode, dot_file.name))
+            self.assertEqual(retcode, 0, msg.format(retcode, dot_file_name))
 
 
 if __name__ == "__main__":

From 2f0498457ec1f3a49425f1207d717acf99dd13cf Mon Sep 17 00:00:00 2001
From: Bowen Fu
Date: Tue, 22 Aug 2017 16:37:30 +0800
Subject: [PATCH 09/10] remove encoding

---
 quepy/dot_generation.py    |  2 --
 quepy/dsl.py               |  6 ------
 quepy/encodingpolicy.py    | 43 --------------------------------
 quepy/mql_generation.py    |  3 ---
 quepy/nltktagger.py        |  3 ---
 quepy/parsing.py           |  3 ---
 quepy/quepyapp.py          |  5 +----
 quepy/sparql_generation.py |  2 --
 quepy/tagger.py            |  4 ----
 9 files changed, 1 insertion(+), 70 deletions(-)
 delete mode 100644 quepy/encodingpolicy.py

diff --git a/quepy/dot_generation.py b/quepy/dot_generation.py
index ab728d3..41eda2b 100644
--- a/quepy/dot_generation.py
+++ b/quepy/dot_generation.py
@@ -7,7 +7,6 @@
 import random
 from quepy.expression import isnode
 from quepy.dsl import IsRelatedTo, HasKeyword
-from quepy.encodingpolicy import assert_valid_encoding
 
 
 def escape(x, add_quotes=True):
@@ -30,7 +29,6 @@ def adapt(x):
         x = "x{}".format(x)
         return x
     if isinstance(x, str):
-        assert_valid_encoding(x)
         x = escape(x)
         if x.startswith("\""):
             return x
diff --git a/quepy/dsl.py b/quepy/dsl.py
index fc66b71..101bda3 100644
--- a/quepy/dsl.py
+++ b/quepy/dsl.py
@@ -14,7 +14,6 @@
 from copy import copy
 
 from quepy.expression import Expression
-from quepy.encodingpolicy import encoding_flexible_conversion
 
 
 class FixedRelation(Expression):
@@ -52,9 +51,6 @@ def __init__(self):
         if self.fixedtype is None:
             raise ValueError("You *must* define the `fixedtype` "
                              "class attribute to use this class.")
-        self.fixedtype = encoding_flexible_conversion(self.fixedtype)
-        self.fixedtyperelation = \
-            encoding_flexible_conversion(self.fixedtyperelation)
         self.add_data(self.fixedtyperelation, self.fixedtype)
 
 
@@ -72,9 +68,7 @@ def __init__(self, data):
         if self.relation is None:
             raise ValueError("You *must* define the `relation` "
                              "class attribute to use this class.")
-        self.relation = encoding_flexible_conversion(self.relation)
         if self.language is not None:
-            self.language = encoding_flexible_conversion(self.language)
             data = "\"{0}\"@{1}".format(data, self.language)
         self.add_data(self.relation, data)
diff --git a/quepy/encodingpolicy.py b/quepy/encodingpolicy.py
deleted file mode 100644
index 4328b27..0000000
--- a/quepy/encodingpolicy.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# coding: utf-8
-
-# Copyright (c) 2012, Machinalis S.R.L.
-# This file is part of quepy and is distributed under the Modified BSD License.
-# You should have received a copy of license in the LICENSE file.
-#
-# Authors: Rafael Carrascosa
-#          Gonzalo Garcia Berrotaran
-
-"""
-Functions to do encoding checkings.
-""" - -import logging -from quepy import settings -logger = logging.getLogger("quepy.encodingpolicy") - - -def encoding_flexible_conversion(string, complain=False): - """ - Converts string to the proper encoding if it's possible - and if it's not raises a ValueError exception. - - If complain it's True, it will emit a logging warning about - converting a string that had to be on the right encoding. - """ - - if isinstance(string, str): - return string - if complain: - logger.warning("Forced to guess the encoding of {!r}, please " - "provide a unicode string instead".format(string)) - return ustring - - -def assert_valid_encoding(string): - """ - If string it's not in a valid encoding it raises a - ValueError exception. - """ - - if not isinstance(string, str): - raise ValueError("Argument must be unicode") diff --git a/quepy/mql_generation.py b/quepy/mql_generation.py index fc39b68..49e87a5 100644 --- a/quepy/mql_generation.py +++ b/quepy/mql_generation.py @@ -4,7 +4,6 @@ import json from quepy.dsl import IsRelatedTo from quepy.expression import isnode -from quepy.encodingpolicy import encoding_flexible_conversion def choose_start_node(e): @@ -27,8 +26,6 @@ def safely_to_unicode(x): """ if isinstance(x, str): return x - if isinstance(x, str): - return encoding_flexible_conversion(x) if isinstance(x, IsRelatedTo): return "/type/reflect/any_master" return str(x) # FIXME: Any object is unicode-able, this is error prone diff --git a/quepy/nltktagger.py b/quepy/nltktagger.py index 582e58c..a36543e 100644 --- a/quepy/nltktagger.py +++ b/quepy/nltktagger.py @@ -17,13 +17,11 @@ import nltk from quepy.tagger import Word -from quepy.encodingpolicy import assert_valid_encoding _penn_to_morphy_tag = {} def penn_to_morphy_tag(tag): - assert_valid_encoding(tag) for penn, morphy in _penn_to_morphy_tag.items(): if tag.startswith(penn): @@ -36,7 +34,6 @@ def run_nltktagger(string, nltk_data_path=None): Runs nltk tagger on `string` and returns a list of :class:`quepy.tagger.Word` objects. """ - assert_valid_encoding(string) global _penn_to_morphy_tag if nltk_data_path: diff --git a/quepy/parsing.py b/quepy/parsing.py index e53faf3..0429126 100644 --- a/quepy/parsing.py +++ b/quepy/parsing.py @@ -11,7 +11,6 @@ import logging from refo import Predicate, Literal, Star, Any, Group -from quepy.encodingpolicy import encoding_flexible_conversion _EOL = None logger = logging.getLogger("quepy.parsing") @@ -136,7 +135,6 @@ class Pos(Predicate): """ def __init__(self, tag): - tag = encoding_flexible_conversion(tag) self.tag = tag super(Pos, self).__init__(self._predicate) self.arg = tag @@ -196,7 +194,6 @@ def __repr__(self): def _predicate_sum_from_string(string, predicate): assert issubclass(predicate, Predicate) - string = encoding_flexible_conversion(string) words = string.split() result = None for word in words: diff --git a/quepy/quepyapp.py b/quepy/quepyapp.py index 415c485..4946928 100644 --- a/quepy/quepyapp.py +++ b/quepy/quepyapp.py @@ -19,7 +19,7 @@ from quepy import generation from quepy.parsing import QuestionTemplate from quepy.tagger import get_tagger, TaggingError -from quepy.encodingpolicy import encoding_flexible_conversion +from quepy.cntagger import get_tagger, TaggingError logger = logging.getLogger("quepy.quepyapp") @@ -119,7 +119,6 @@ def get_queries(self, question): The queries returned corresponds to the regexes that match in weight order. 
""" - question = encoding_flexible_conversion(question) for expression, userdata in self._iter_compiled_forms(question): target, query = generation.get_code(expression, self.language) message = "Interpretation {1}: {0}" @@ -157,6 +156,4 @@ def _save_settings_values(self): for key in dir(self._settings_module): if key.upper() == key: value = getattr(self._settings_module, key) - if isinstance(value, str): - value = encoding_flexible_conversion(value) setattr(settings, key, value) diff --git a/quepy/sparql_generation.py b/quepy/sparql_generation.py index a7403a6..a0da704 100644 --- a/quepy/sparql_generation.py +++ b/quepy/sparql_generation.py @@ -7,7 +7,6 @@ from quepy import settings from quepy.dsl import IsRelatedTo from quepy.expression import isnode -from quepy.encodingpolicy import assert_valid_encoding _indent = " " @@ -30,7 +29,6 @@ def adapt(x): x = "?x{}".format(x) return x if isinstance(x, str): - assert_valid_encoding(x) if x.startswith("\"") or ":" in x: return x return '"{}"'.format(x) diff --git a/quepy/tagger.py b/quepy/tagger.py index ff99758..16bb339 100644 --- a/quepy/tagger.py +++ b/quepy/tagger.py @@ -10,7 +10,6 @@ import logging from quepy import settings -from quepy.encodingpolicy import assert_valid_encoding logger = logging.getLogger("quepy.tagger") PENN_TAGSET = set("$ `` '' ( ) , -- . : CC CD DT EX FW IN JJ JJR JJS LS MD " @@ -31,8 +30,10 @@ class Word(object): Contains *token*, *lemma*, *pos tag* and optionally a *probability* of that tag. """ + """ _encoding_attrs = "token lemma pos".split() _attrs = _encoding_attrs + ["prob"] + """ def __init__(self, token, lemma=None, pos=None, prob=None): self.pos = pos @@ -41,8 +42,6 @@ def __init__(self, token, lemma=None, pos=None, prob=None): self.token = token def __setattr__(self, name, value): - if name in self._encoding_attrs and value is not None: - assert_valid_encoding(value) object.__setattr__(self, name, value) def __unicode__(self): @@ -64,7 +63,6 @@ def get_tagger(): tagger_function = lambda x: run_nltktagger(x, settings.NLTK_DATA_PATH) def wrapper(string): - assert_valid_encoding(string) words = tagger_function(string) for word in words: if word.pos not in PENN_TAGSET: From 0753ae829478dd34d8a235dacc7b5604a8550d75 Mon Sep 17 00:00:00 2001 From: Bowen Fu Date: Tue, 22 Aug 2017 18:11:09 +0800 Subject: [PATCH 10/10] remove penn for cntagger --- examples/dbpedia/main.py | 4 ++-- quepy/quepyapp.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/dbpedia/main.py b/examples/dbpedia/main.py index fd05a20..d8ac825 100644 --- a/examples/dbpedia/main.py +++ b/examples/dbpedia/main.py @@ -139,9 +139,9 @@ def wikipedia2dbpedia(wikipedia_url): if __name__ == "__main__": default_questions = [ - "谁是奥巴马?", - "What is a car?",] + "谁是奥巴马?",] """ + "What is a car?",] "Who is Tom Cruise?", "Who is George Lucas?", "Who is Mirtha Legrand?", diff --git a/quepy/quepyapp.py b/quepy/quepyapp.py index 4946928..ea9431d 100644 --- a/quepy/quepyapp.py +++ b/quepy/quepyapp.py @@ -18,7 +18,7 @@ from quepy import settings from quepy import generation from quepy.parsing import QuestionTemplate -from quepy.tagger import get_tagger, TaggingError +#from quepy.tagger import get_tagger, TaggingError from quepy.cntagger import get_tagger, TaggingError logger = logging.getLogger("quepy.quepyapp")