From 48181c79ea6fb9579604751e8ec7ab6cca2c987a Mon Sep 17 00:00:00 2001
From: odanoburu <bcclaro@gmail.com>
Date: Fri, 30 Aug 2019 17:32:52 -0300
Subject: [PATCH] * relations.tsv

- use commas for pos column
- add satellite adjectives PoS

* validation

- check that a relation has the correct domain (either synset or
  wordsense of the appropriate PoS)

obs: maybe this is too much to check in the parser phase and we should
defer it to validation? (the parser is not so easy to read now)

related to #6
---
 relations.tsv | 34 +++++++++++++++++-----------------
 src/Data.hs   | 21 +++++++++++++++++++++
 src/Lib.hs    | 36 ++++++++++++++++++++++++------------
 src/Parse.hs  | 38 +++++++++++++++++++++++++-------------
 4 files changed, 87 insertions(+), 42 deletions(-)

diff --git a/relations.tsv b/relations.tsv
index f8f1470..ae7c4ae 100644
--- a/relations.tsv
+++ b/relations.tsv
@@ -1,5 +1,5 @@
 name	lexicographer file	text file	rdf	pos	domain	description
-antonymOf	!	ant	wn30/own-pt/schema/antonymOf	nvar	word	Antonym
+antonymOf	!	ant	wn30/own-pt/schema/antonymOf	n,v,a,r	word	Antonym
 memberHolonymOf	#m	hm	wn30/own-pt/schema/memberHolonymOf	n	synset	Member holonym
 partHolonymOf	#p	hp	wn30/own-pt/schema/partHolonymOf	n	synset	Part holonym
 substanceHolonymOf	#s	hs	wn30/own-pt/schema/substanceHolonymOf	n	synset	Substance holonym
@@ -7,28 +7,28 @@ sameVerbGroupAs	$	vg	wn30/own-pt/schema/sameVerbGroupAs	v	synset,word	Verb Group
 memberMeronymOf	%m	mm	wn30/own-pt/schema/memberMeronymOf	n	synset	Member meronym
 partMeronymOf	%p	mp	wn30/own-pt/schema/partMeronymOf	n	synset	Part meronym
 substanceMeronymOf	%s	ms	wn30/own-pt/schema/substanceMeronymOf	n	synset	Substance meronym
-similarTo	&	sim	wn30/own-pt/schema/similarTo	a	synset	Similar to
+similarTo	&	sim	wn30/own-pt/schema/similarTo	a,s	synset	Similar to
 entails	*	entail	wn30/own-pt/schema/entails	v	synset	entailment
-derivationallyRelated	+	drf	wn30/own-pt/schema/derivationallyRelated	nv	synset,word	Derivationally related form
-classifiedByTopic	-c	mt	wn30/own-pt/schema/classifiedByTopic	nvar	synset,word	Member of this domain - TOPIC
-classifiedByRegion	-r	mr	wn30/own-pt/schema/classifiedByRegion	nvar	synset,word	Member of this domain - REGION
-classifiedByUsage	-u	mu	wn30/own-pt/schema/classifiedByUsage	nvar	synset,word	Member of this domain - USAGE
+derivationallyRelated	+	drf	wn30/own-pt/schema/derivationallyRelated	n,v,a,s	synset,word	Derivationally related form
+classifiedByTopic	-c	mt	wn30/own-pt/schema/classifiedByTopic	n,v,a,r,s	synset,word	Member of this domain - TOPIC
+classifiedByRegion	-r	mr	wn30/own-pt/schema/classifiedByRegion	n,v,a,r,s	synset,word	Member of this domain - REGION
+classifiedByUsage	-u	mu	wn30/own-pt/schema/classifiedByUsage	n,v,a,r,s	synset,word	Member of this domain - USAGE
 classifiesByTopic	;c	dt	wn30/own-pt/schema/classifiesByTopic	n	synset,word	Domain of synset - TOPIC
 classifiesByRegion	;r	dr	wn30/own-pt/schema/classifiesByRegion	n	synset,word	Domain of synset - REGION
 classifiesByUsage	;u	du	wn30/own-pt/schema/classifiesByUsage	n	synset,word	Domain of synset - USAGE
 participleOf	<	pv	wn30/own-pt/schema/participleOf	a	word	Participle of verb
-attribute	=	attr	wn30/own-pt/schema/attribute	na	synset	Attribute
+attribute	=	attr	wn30/own-pt/schema/attribute	n,a	synset	Attribute
 causes	>	cause	wn30/own-pt/schema/causes	v	synset	Cause
-hypernymOf	@	hyper	wn30/own-pt/schema/hypernymOf	nv	synset	Hypernym
+hypernymOf	@	hyper	wn30/own-pt/schema/hypernymOf	n,v	synset	Hypernym
 instanceOf	@i	ihyper	wn30/own-pt/schema/instanceOf	n	synset	Instance Hypernym (9/11 is a terrorist attack)
-pertainsTo	\	pe	wn30/own-pt/schema/pertainsTo	ar	word	Pertainym pertains to noun/adjective
-seeAlso	^	see	wn30/own-pt/schema/seeAlso	va	synset,word	Also see
-hyponymOf	~	hypo	wn30/own-pt/schema/hyponymOf	nv	synset	Hyponym
+pertainsTo	\	pe	wn30/own-pt/schema/pertainsTo	a,r	word	Pertainym pertains to noun/adjective
+seeAlso	^	see	wn30/own-pt/schema/seeAlso	v,a	synset,word	Also see
+hyponymOf	~	hypo	wn30/own-pt/schema/hyponymOf	n,v	synset	Hyponym
 hasInstance	~i	ihypo	wn30/own-pt/schema/hasInstance	n	synset	Instance Hyponym (terrorist attack has instance 9/11)
-lexicographerFile	_	_	wn30/own-pt/schema/lexicographerFile	nvar	synset	lexicographer file relation
-definition	_	d	wn30/own-pt/schema/definition	nvar	synset	definition relation
-example	_	e	wn30/own-pt/schema/example	nvar	synset	example relation
-containsWordSense	_	w	wn30/own-pt/schema/containsWordSense	nvar	synset	contains word sense relation
-frame	_	fs	wn30/own-pt/schema/frame	nvar	synset,word	frame relation
-lexicalForm	_	_	wn30/own-pt/schema/lexicalForm	nvar	synset	lexicographer file relation
+lexicographerFile	_	_	wn30/own-pt/schema/lexicographerFile	n,v,a,r,s	synset	lexicographer file relation
+definition	_	d	wn30/own-pt/schema/definition	n,v,a,r,s	synset	definition relation
+example	_	e	wn30/own-pt/schema/example	n,v,a,r,s	synset	example relation
+containsWordSense	_	w	wn30/own-pt/schema/containsWordSense	n,v,a,r	synset	contains word sense relation
+frame	_	fs	wn30/own-pt/schema/frame	v	synset,word	frame relation
+lexicalForm	_	_	wn30/own-pt/schema/lexicalForm	n,v,a,r,s	synset	lexicographer file relation
 syntacticMarker	_	marker	wn30/own-pt/schema/syntacticMarker	a	word	   marker
\ No newline at end of file
diff --git a/src/Data.hs b/src/Data.hs
index 2279fb5..6e61e5e 100644
--- a/src/Data.hs
+++ b/src/Data.hs
@@ -23,8 +23,29 @@ import Text.Printf (printf)
 singleton :: a -> NonEmpty a
 singleton x = x :| []
 
+data WNObj = SynsetObj | WordObj deriving (Eq,Enum)
+
+instance Show WNObj where
+  show SynsetObj = "synset"
+  show WordObj   = "word"
+
+readWNObj :: Text -> WNObj
+readWNObj input = case input of
+  "synset" -> SynsetObj
+  "word"   -> WordObj
+  _        -> error . T.unpack
+    $ T.intercalate " " ["Can't parse", input, "as WordNet object name (one of synset or word)"]
+
 data WNPOS = A | S | R | N | V deriving (Eq,Enum,Ord,Show)
 
+readWNPOS :: Text -> WNPOS
+readWNPOS "n" = N
+readWNPOS "a" = A
+readWNPOS "r" = R
+readWNPOS "v" = V
+readWNPOS "s" = S
+readWNPOS input = error $ T.unpack input ++ " is not a valid PoS"
+
 newtype LexicographerFileId = LexicographerFileId (WNPOS, Text) deriving (Eq,Ord,Show)
 
 synsetType :: WNPOS -> Int
diff --git a/src/Lib.hs b/src/Lib.hs
index 1837725..8d5816d 100644
--- a/src/Lib.hs
+++ b/src/Lib.hs
@@ -11,7 +11,8 @@ module Lib
 
 import Data ( Synset(..), Unvalidated, Validated
             , Validation(..), SourceValidation, singleton
-            , SourceError(..), WNError(..), SourcePosition(..) )
+            , SourceError(..), WNError(..), SourcePosition(..)
+            , WNObj(..), readWNObj, WNPOS(..), readWNPOS )
 import Export (synsetToTriples,synsetsToSynsetJSONs)
 import Parse (parseLexicographer)
 import Validate ( makeIndex
@@ -56,14 +57,15 @@ data Config = Config
   , textToCanonicNames :: Map Text Text
   -- | maps canonical relation names to their RDF names
   , canonicToRDFNames  :: Map Text Text
+  -- | maps canonic relation names to their possible domains
+  , canonicToDomain    :: Map Text (NonEmpty WNObj, NonEmpty WNPOS)
   -- | contains the filepaths to the files in the WordNet
   , lexFilePaths       :: NonEmpty FilePath
   } deriving (Show,Eq)
 
-readTSV :: Ord a => FilePath -> ([Text] -> Either String [(a, b)]) -> IO (Map a b)
-readTSV filepath readLine = do
-  text <- TIO.readFile filepath
-  case go text of
+readTSV :: Ord a => Text -> ([Text] -> Either String [(a, b)]) -> IO (Map a b)
+readTSV input readLine =
+  case go input of
     ([], pairs) -> return . M.fromList $ concat pairs
     (errors, _) -> mapM_ putStrLn errors >> return M.empty
   where
@@ -81,12 +83,16 @@ readConfig configurationDir' = do
   go configurationDir
   where
     go configurationDir = do
-      lexnamesToId       <- readTSV (configurationDir </> "lexnames.tsv") lexnamesReader
-      -- FIXME: reading file twice, but oh well..
-      textToCanonicNames <- readTSV (configurationDir </> "relations.tsv") textToCanonicNamesReader
-      canonicToRDFNames  <- readTSV (configurationDir </> "relations.tsv") canonicToRDFNamesReader
+      lexNamesInput      <- TIO.readFile $ configurationDir </> "lexnames.tsv"
+      lexnamesToId       <- readTSV lexNamesInput lexnamesReader
+      relationsInput     <- TIO.readFile $ configurationDir </> "relations.tsv"
+      textToCanonicNames <- readTSV relationsInput textToCanonicNamesReader
+      canonicToRDFNames  <- readTSV relationsInput canonicToRDFNamesReader
+      canonicToDomain    <- readTSV relationsInput canonicToDomainReader
       let lexFilePaths = lexFilePaths' lexnamesToId
-      return $ Config {lexnamesToId, textToCanonicNames, canonicToRDFNames, lexFilePaths}
+      return $ Config {lexnamesToId, textToCanonicNames
+                      , canonicToRDFNames, lexFilePaths, canonicToDomain
+                      }
         where
           lexFilePaths' lexNamesMap =
             let lexnames = map normalize (M.keys lexNamesMap)
@@ -105,15 +111,21 @@ readConfig configurationDir' = do
           textToCanonicNamesReader _ = Left "Wrong number of fields in relations.tsv"
           canonicToRDFNamesReader  [canonicName,_,_,rdfName,_,_,_] = Right [(canonicName, rdfName)]
           canonicToRDFNamesReader _ = Left "Wrong number of fields in relations.tsv"
+          canonicToDomainReader [canonicName,_,_,_,pos,domain,_] =
+            Right [(canonicName, ( readListField readWNObj domain
+                                 , readListField readWNPOS pos))]
+          canonicToDomainReader _ = Left "wrong number of Fiels in relations.tsv"
+          readListField f = NE.fromList . map (f . T.strip) . T.splitOn ","
+                                                                 
 
 type App = ReaderT Config IO
 
 parseLexicographerFile :: FilePath -> App (SourceValidation (NonEmpty (Synset Unvalidated)))
 parseLexicographerFile filePath = do
-  Config{textToCanonicNames} <- ask
+  Config{textToCanonicNames, canonicToDomain} <- ask
   liftIO $ do
     content <- TIO.readFile $ normalise filePath
-    let result = parseLexicographer textToCanonicNames filePath content
+    let result = parseLexicographer textToCanonicNames canonicToDomain filePath content
     return result
 
 parseLexicographerFiles ::  NonEmpty FilePath
diff --git a/src/Parse.hs b/src/Parse.hs
index a822be0..f9c90be 100644
--- a/src/Parse.hs
+++ b/src/Parse.hs
@@ -22,19 +22,22 @@ import qualified Text.Megaparsec.Char.Lexer as L
 --import Text.Megaparsec.Debug (dbg)
 import qualified Data.Set as S
 
+fst3 :: (a,b,c) -> a
+fst3 (first,_,_) = first
 
 type RawSynset = Either (ParseError Text Void) (Synset Unvalidated)
 
 -- State stores in which lexicographer file we're in, this is useful
 -- to fill in implicit references
-type Parser = ParsecT Void Text (Reader (LexicographerFileId, Map Text Text))
+type Parser = ParsecT Void Text (Reader (LexicographerFileId, Map Text Text, Map Text (NonEmpty WNObj, NonEmpty WNPOS)))
 
 parseLexicographer :: Map Text Text
+  -> Map Text (NonEmpty WNObj, NonEmpty WNPOS)
   -> String -> Text
   -> SourceValidation (NonEmpty (Synset Unvalidated))
-parseLexicographer relationsMap fileName inputText =
+parseLexicographer textToCanonicNames canonicToDomain fileName inputText =
   case runReader (runParserT lexicographerFile fileName inputText)
-                 (lexFileId, relationsMap) of
+                 (lexFileId, textToCanonicNames, canonicToDomain) of
     Right rawSynsets
       -> case partitionEithers (NE.toList rawSynsets) of
            ([], synsetsToValidate) -> Success $ NE.fromList synsetsToValidate
@@ -89,7 +92,7 @@ synsets = synsetOrError `NC.sepEndBy1` many linebreak
 synset :: Parser (Synset Unvalidated)
 synset = do
   startOffset      <- (1 +) <$> getOffset
-  lexicographerId  <- reader fst
+  lexicographerId  <- reader fst3
   synsetWordSenses <- wordSenseStatement `NC.endBy1` linebreak
   synsetDefinition <- definitionStatement <* linebreak
   synsetExamples   <- exampleStatement `endBy` linebreak
@@ -129,21 +132,30 @@ synsetRelationStatement :: Parser SynsetRelation
 synsetRelationStatement = L.nonIndented spaceConsumer go
   where
     go = SynsetRelation
-      <$> relationNameP synsetRelationName
+      <$> relationNameP SynsetObj synsetRelationName
       <*> (SynsetIdentifier <$> identifier)
     synsetRelationName = T.stripEnd
       -- [ ] handle this better
       <$> (takeWhile1P Nothing (`notElem` [':', ' ', '\n']) <?> "Synset relation name")
       <* symbol ":"
 
-relationNameP :: Parser Text -> Parser Text
-relationNameP name = do
-  relationsMap <- reader snd
-  relationName <- name
+relationNameP :: WNObj -> Parser Text -> Parser Text
+relationNameP obj name = do
+  LexicographerFileId (wnPOS, _) <- reader fst3
+  relationsMap     <- reader $ \(_,second,_) -> second
+  relationsDomains <- reader $ \(_,_,third) -> third
+  relationName     <- name
+  let relationString = T.unpack relationName
   case M.lookup relationName relationsMap of
-    Just _ -> if relationName `elem` ["d", "e", "fs", "w"]
+    Just canonicName -> if relationName `elem` ["d", "e", "fs", "w"]
               then fail "Synset components must come in the following order: words, definition, examples, frames, and synset relations"
-              else return relationName
+              else case M.lookup canonicName relationsDomains of
+                     Just (domain,poses) -> case (obj `elem` domain, wnPOS `elem` poses) of
+                       (True,True)  -> return relationName
+                       _            -> fail $ relationString
+                                            ++ " is not a valid relation for a " ++ show obj
+                                            ++ " with PoS " ++ show wnPOS
+                     Nothing -> error $ "Can't find " ++ T.unpack relationName ++ "'s domain"
     Nothing -> failure (Just $ toErrorItem relationName)
                        (S.fromList . map toErrorItem $ M.keys relationsMap)
   where
@@ -163,7 +175,7 @@ wordSenseIdentifier = WordSenseIdentifier <$> identifier
 identifier :: Parser (LexicographerFileId, WordSenseForm, LexicalId)
 identifier =
   (,,)
-  <$>  (try lexicographerIdentifier <|> reader fst)
+  <$>  (try lexicographerIdentifier <|> reader fst3)
   <*>  fmap WordSenseForm word <*> lexicalIdentifier
   where
     lexicographerIdentifier = do
@@ -175,7 +187,7 @@ wordSensePointers :: Parser [WordPointer]
 wordSensePointers = many go
   where
     go =  WordPointer
-      <$> relationNameP (word <?> "Word pointer")
+      <$> relationNameP WordObj (word <?> "Word pointer")
       <*> wordSenseIdentifier
 
 word :: Parser Text