From 48181c79ea6fb9579604751e8ec7ab6cca2c987a Mon Sep 17 00:00:00 2001 From: odanoburu Date: Fri, 30 Aug 2019 17:32:52 -0300 Subject: [PATCH] * relations.tsv - use commas for pos column - add satellite adjectives PoS * validation - check that a relation has the correct domain (either synset or wordsense of the appropriate PoS) obs: maybe this is too much to check in the parser phase and we should defer it to validation? (the parser is not so easy to read now) related to #6 --- relations.tsv | 34 +++++++++++++++++----------------- src/Data.hs | 21 +++++++++++++++++++++ src/Lib.hs | 36 ++++++++++++++++++++++++------------ src/Parse.hs | 38 +++++++++++++++++++++++++------------- 4 files changed, 87 insertions(+), 42 deletions(-) diff --git a/relations.tsv b/relations.tsv index f8f1470..ae7c4ae 100644 --- a/relations.tsv +++ b/relations.tsv @@ -1,5 +1,5 @@ name lexicographer file text file rdf pos domain description -antonymOf ! ant wn30/own-pt/schema/antonymOf nvar word Antonym +antonymOf ! ant wn30/own-pt/schema/antonymOf n,v,a,r word Antonym memberHolonymOf #m hm wn30/own-pt/schema/memberHolonymOf n synset Member holonym partHolonymOf #p hp wn30/own-pt/schema/partHolonymOf n synset Part holonym substanceHolonymOf #s hs wn30/own-pt/schema/substanceHolonymOf n synset Substance holonym @@ -7,28 +7,28 @@ sameVerbGroupAs $ vg wn30/own-pt/schema/sameVerbGroupAs v synset,word Verb Group memberMeronymOf %m mm wn30/own-pt/schema/memberMeronymOf n synset Member meronym partMeronymOf %p mp wn30/own-pt/schema/partMeronymOf n synset Part meronym substanceMeronymOf %s ms wn30/own-pt/schema/substanceMeronymOf n synset Substance meronym -similarTo & sim wn30/own-pt/schema/similarTo a synset Similar to +similarTo & sim wn30/own-pt/schema/similarTo a,s synset Similar to entails * entail wn30/own-pt/schema/entails v synset entailment -derivationallyRelated + drf wn30/own-pt/schema/derivationallyRelated nv synset,word Derivationally related form -classifiedByTopic -c mt wn30/own-pt/schema/classifiedByTopic nvar synset,word Member of this domain - TOPIC -classifiedByRegion -r mr wn30/own-pt/schema/classifiedByRegion nvar synset,word Member of this domain - REGION -classifiedByUsage -u mu wn30/own-pt/schema/classifiedByUsage nvar synset,word Member of this domain - USAGE +derivationallyRelated + drf wn30/own-pt/schema/derivationallyRelated n,v,a,s synset,word Derivationally related form +classifiedByTopic -c mt wn30/own-pt/schema/classifiedByTopic n,v,a,r,s synset,word Member of this domain - TOPIC +classifiedByRegion -r mr wn30/own-pt/schema/classifiedByRegion n,v,a,r,s synset,word Member of this domain - REGION +classifiedByUsage -u mu wn30/own-pt/schema/classifiedByUsage n,v,a,r,s synset,word Member of this domain - USAGE classifiesByTopic ;c dt wn30/own-pt/schema/classifiesByTopic n synset,word Domain of synset - TOPIC classifiesByRegion ;r dr wn30/own-pt/schema/classifiesByRegion n synset,word Domain of synset - REGION classifiesByUsage ;u du wn30/own-pt/schema/classifiesByUsage n synset,word Domain of synset - USAGE participleOf < pv wn30/own-pt/schema/participleOf a word Participle of verb -attribute = attr wn30/own-pt/schema/attribute na synset Attribute +attribute = attr wn30/own-pt/schema/attribute n,a synset Attribute causes > cause wn30/own-pt/schema/causes v synset Cause -hypernymOf @ hyper wn30/own-pt/schema/hypernymOf nv synset Hypernym +hypernymOf @ hyper wn30/own-pt/schema/hypernymOf n,v synset Hypernym instanceOf @i ihyper wn30/own-pt/schema/instanceOf n synset Instance Hypernym (9/11 is a terrorist attack) -pertainsTo \ pe wn30/own-pt/schema/pertainsTo ar word Pertainym pertains to noun/adjective -seeAlso ^ see wn30/own-pt/schema/seeAlso va synset,word Also see -hyponymOf ~ hypo wn30/own-pt/schema/hyponymOf nv synset Hyponym +pertainsTo \ pe wn30/own-pt/schema/pertainsTo a,r word Pertainym pertains to noun/adjective +seeAlso ^ see wn30/own-pt/schema/seeAlso v,a synset,word Also see +hyponymOf ~ hypo wn30/own-pt/schema/hyponymOf n,v synset Hyponym hasInstance ~i ihypo wn30/own-pt/schema/hasInstance n synset Instance Hyponym (terrorist attack has instance 9/11) -lexicographerFile _ _ wn30/own-pt/schema/lexicographerFile nvar synset lexicographer file relation -definition _ d wn30/own-pt/schema/definition nvar synset definition relation -example _ e wn30/own-pt/schema/example nvar synset example relation -containsWordSense _ w wn30/own-pt/schema/containsWordSense nvar synset contains word sense relation -frame _ fs wn30/own-pt/schema/frame nvar synset,word frame relation -lexicalForm _ _ wn30/own-pt/schema/lexicalForm nvar synset lexicographer file relation +lexicographerFile _ _ wn30/own-pt/schema/lexicographerFile n,v,a,r,s synset lexicographer file relation +definition _ d wn30/own-pt/schema/definition n,v,a,r,s synset definition relation +example _ e wn30/own-pt/schema/example n,v,a,r,s synset example relation +containsWordSense _ w wn30/own-pt/schema/containsWordSense n,v,a,r synset contains word sense relation +frame _ fs wn30/own-pt/schema/frame v synset,word frame relation +lexicalForm _ _ wn30/own-pt/schema/lexicalForm n,v,a,r,s synset lexicographer file relation syntacticMarker _ marker wn30/own-pt/schema/syntacticMarker a word marker \ No newline at end of file diff --git a/src/Data.hs b/src/Data.hs index 2279fb5..6e61e5e 100644 --- a/src/Data.hs +++ b/src/Data.hs @@ -23,8 +23,29 @@ import Text.Printf (printf) singleton :: a -> NonEmpty a singleton x = x :| [] +data WNObj = SynsetObj | WordObj deriving (Eq,Enum) + +instance Show WNObj where + show SynsetObj = "synset" + show WordObj = "word" + +readWNObj :: Text -> WNObj +readWNObj input = case input of + "synset" -> SynsetObj + "word" -> WordObj + _ -> error . T.unpack + $ T.intercalate " " ["Can't parse", input, "as WordNet object name (one of synset or word)"] + data WNPOS = A | S | R | N | V deriving (Eq,Enum,Ord,Show) +readWNPOS :: Text -> WNPOS +readWNPOS "n" = N +readWNPOS "a" = A +readWNPOS "r" = R +readWNPOS "v" = V +readWNPOS "s" = S +readWNPOS input = error $ T.unpack input ++ " is not a valid PoS" + newtype LexicographerFileId = LexicographerFileId (WNPOS, Text) deriving (Eq,Ord,Show) synsetType :: WNPOS -> Int diff --git a/src/Lib.hs b/src/Lib.hs index 1837725..8d5816d 100644 --- a/src/Lib.hs +++ b/src/Lib.hs @@ -11,7 +11,8 @@ module Lib import Data ( Synset(..), Unvalidated, Validated , Validation(..), SourceValidation, singleton - , SourceError(..), WNError(..), SourcePosition(..) ) + , SourceError(..), WNError(..), SourcePosition(..) + , WNObj(..), readWNObj, WNPOS(..), readWNPOS ) import Export (synsetToTriples,synsetsToSynsetJSONs) import Parse (parseLexicographer) import Validate ( makeIndex @@ -56,14 +57,15 @@ data Config = Config , textToCanonicNames :: Map Text Text -- | maps canonical relation names to their RDF names , canonicToRDFNames :: Map Text Text + -- | maps canonic relation names to their possible domains + , canonicToDomain :: Map Text (NonEmpty WNObj, NonEmpty WNPOS) -- | contains the filepaths to the files in the WordNet , lexFilePaths :: NonEmpty FilePath } deriving (Show,Eq) -readTSV :: Ord a => FilePath -> ([Text] -> Either String [(a, b)]) -> IO (Map a b) -readTSV filepath readLine = do - text <- TIO.readFile filepath - case go text of +readTSV :: Ord a => Text -> ([Text] -> Either String [(a, b)]) -> IO (Map a b) +readTSV input readLine = + case go input of ([], pairs) -> return . M.fromList $ concat pairs (errors, _) -> mapM_ putStrLn errors >> return M.empty where @@ -81,12 +83,16 @@ readConfig configurationDir' = do go configurationDir where go configurationDir = do - lexnamesToId <- readTSV (configurationDir "lexnames.tsv") lexnamesReader - -- FIXME: reading file twice, but oh well.. - textToCanonicNames <- readTSV (configurationDir "relations.tsv") textToCanonicNamesReader - canonicToRDFNames <- readTSV (configurationDir "relations.tsv") canonicToRDFNamesReader + lexNamesInput <- TIO.readFile $ configurationDir "lexnames.tsv" + lexnamesToId <- readTSV lexNamesInput lexnamesReader + relationsInput <- TIO.readFile $ configurationDir "relations.tsv" + textToCanonicNames <- readTSV relationsInput textToCanonicNamesReader + canonicToRDFNames <- readTSV relationsInput canonicToRDFNamesReader + canonicToDomain <- readTSV relationsInput canonicToDomainReader let lexFilePaths = lexFilePaths' lexnamesToId - return $ Config {lexnamesToId, textToCanonicNames, canonicToRDFNames, lexFilePaths} + return $ Config {lexnamesToId, textToCanonicNames + , canonicToRDFNames, lexFilePaths, canonicToDomain + } where lexFilePaths' lexNamesMap = let lexnames = map normalize (M.keys lexNamesMap) @@ -105,15 +111,21 @@ readConfig configurationDir' = do textToCanonicNamesReader _ = Left "Wrong number of fields in relations.tsv" canonicToRDFNamesReader [canonicName,_,_,rdfName,_,_,_] = Right [(canonicName, rdfName)] canonicToRDFNamesReader _ = Left "Wrong number of fields in relations.tsv" + canonicToDomainReader [canonicName,_,_,_,pos,domain,_] = + Right [(canonicName, ( readListField readWNObj domain + , readListField readWNPOS pos))] + canonicToDomainReader _ = Left "wrong number of Fiels in relations.tsv" + readListField f = NE.fromList . map (f . T.strip) . T.splitOn "," + type App = ReaderT Config IO parseLexicographerFile :: FilePath -> App (SourceValidation (NonEmpty (Synset Unvalidated))) parseLexicographerFile filePath = do - Config{textToCanonicNames} <- ask + Config{textToCanonicNames, canonicToDomain} <- ask liftIO $ do content <- TIO.readFile $ normalise filePath - let result = parseLexicographer textToCanonicNames filePath content + let result = parseLexicographer textToCanonicNames canonicToDomain filePath content return result parseLexicographerFiles :: NonEmpty FilePath diff --git a/src/Parse.hs b/src/Parse.hs index a822be0..f9c90be 100644 --- a/src/Parse.hs +++ b/src/Parse.hs @@ -22,19 +22,22 @@ import qualified Text.Megaparsec.Char.Lexer as L --import Text.Megaparsec.Debug (dbg) import qualified Data.Set as S +fst3 :: (a,b,c) -> a +fst3 (first,_,_) = first type RawSynset = Either (ParseError Text Void) (Synset Unvalidated) -- State stores in which lexicographer file we're in, this is useful -- to fill in implicit references -type Parser = ParsecT Void Text (Reader (LexicographerFileId, Map Text Text)) +type Parser = ParsecT Void Text (Reader (LexicographerFileId, Map Text Text, Map Text (NonEmpty WNObj, NonEmpty WNPOS))) parseLexicographer :: Map Text Text + -> Map Text (NonEmpty WNObj, NonEmpty WNPOS) -> String -> Text -> SourceValidation (NonEmpty (Synset Unvalidated)) -parseLexicographer relationsMap fileName inputText = +parseLexicographer textToCanonicNames canonicToDomain fileName inputText = case runReader (runParserT lexicographerFile fileName inputText) - (lexFileId, relationsMap) of + (lexFileId, textToCanonicNames, canonicToDomain) of Right rawSynsets -> case partitionEithers (NE.toList rawSynsets) of ([], synsetsToValidate) -> Success $ NE.fromList synsetsToValidate @@ -89,7 +92,7 @@ synsets = synsetOrError `NC.sepEndBy1` many linebreak synset :: Parser (Synset Unvalidated) synset = do startOffset <- (1 +) <$> getOffset - lexicographerId <- reader fst + lexicographerId <- reader fst3 synsetWordSenses <- wordSenseStatement `NC.endBy1` linebreak synsetDefinition <- definitionStatement <* linebreak synsetExamples <- exampleStatement `endBy` linebreak @@ -129,21 +132,30 @@ synsetRelationStatement :: Parser SynsetRelation synsetRelationStatement = L.nonIndented spaceConsumer go where go = SynsetRelation - <$> relationNameP synsetRelationName + <$> relationNameP SynsetObj synsetRelationName <*> (SynsetIdentifier <$> identifier) synsetRelationName = T.stripEnd -- [ ] handle this better <$> (takeWhile1P Nothing (`notElem` [':', ' ', '\n']) "Synset relation name") <* symbol ":" -relationNameP :: Parser Text -> Parser Text -relationNameP name = do - relationsMap <- reader snd - relationName <- name +relationNameP :: WNObj -> Parser Text -> Parser Text +relationNameP obj name = do + LexicographerFileId (wnPOS, _) <- reader fst3 + relationsMap <- reader $ \(_,second,_) -> second + relationsDomains <- reader $ \(_,_,third) -> third + relationName <- name + let relationString = T.unpack relationName case M.lookup relationName relationsMap of - Just _ -> if relationName `elem` ["d", "e", "fs", "w"] + Just canonicName -> if relationName `elem` ["d", "e", "fs", "w"] then fail "Synset components must come in the following order: words, definition, examples, frames, and synset relations" - else return relationName + else case M.lookup canonicName relationsDomains of + Just (domain,poses) -> case (obj `elem` domain, wnPOS `elem` poses) of + (True,True) -> return relationName + _ -> fail $ relationString + ++ " is not a valid relation for a " ++ show obj + ++ " with PoS " ++ show wnPOS + Nothing -> error $ "Can't find " ++ T.unpack relationName ++ "'s domain" Nothing -> failure (Just $ toErrorItem relationName) (S.fromList . map toErrorItem $ M.keys relationsMap) where @@ -163,7 +175,7 @@ wordSenseIdentifier = WordSenseIdentifier <$> identifier identifier :: Parser (LexicographerFileId, WordSenseForm, LexicalId) identifier = (,,) - <$> (try lexicographerIdentifier <|> reader fst) + <$> (try lexicographerIdentifier <|> reader fst3) <*> fmap WordSenseForm word <*> lexicalIdentifier where lexicographerIdentifier = do @@ -175,7 +187,7 @@ wordSensePointers :: Parser [WordPointer] wordSensePointers = many go where go = WordPointer - <$> relationNameP (word "Word pointer") + <$> relationNameP WordObj (word "Word pointer") <*> wordSenseIdentifier word :: Parser Text