Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

check relations domains #26

Merged
merged 1 commit into from
Aug 30, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 17 additions & 17 deletions relations.tsv
Original file line number Diff line number Diff line change
@@ -1,34 +1,34 @@
name lexicographer file text file rdf pos domain description
antonymOf ! ant wn30/own-pt/schema/antonymOf nvar word Antonym
antonymOf ! ant wn30/own-pt/schema/antonymOf n,v,a,r word Antonym
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

algum problema com essa mudança? imagino que não mas é bom checar rs

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tudo top!

memberHolonymOf #m hm wn30/own-pt/schema/memberHolonymOf n synset Member holonym
partHolonymOf #p hp wn30/own-pt/schema/partHolonymOf n synset Part holonym
substanceHolonymOf #s hs wn30/own-pt/schema/substanceHolonymOf n synset Substance holonym
sameVerbGroupAs $ vg wn30/own-pt/schema/sameVerbGroupAs v synset,word Verb Group
memberMeronymOf %m mm wn30/own-pt/schema/memberMeronymOf n synset Member meronym
partMeronymOf %p mp wn30/own-pt/schema/partMeronymOf n synset Part meronym
substanceMeronymOf %s ms wn30/own-pt/schema/substanceMeronymOf n synset Substance meronym
similarTo & sim wn30/own-pt/schema/similarTo a synset Similar to
similarTo & sim wn30/own-pt/schema/similarTo a,s synset Similar to
entails * entail wn30/own-pt/schema/entails v synset entailment
derivationallyRelated + drf wn30/own-pt/schema/derivationallyRelated nv synset,word Derivationally related form
classifiedByTopic -c mt wn30/own-pt/schema/classifiedByTopic nvar synset,word Member of this domain - TOPIC
classifiedByRegion -r mr wn30/own-pt/schema/classifiedByRegion nvar synset,word Member of this domain - REGION
classifiedByUsage -u mu wn30/own-pt/schema/classifiedByUsage nvar synset,word Member of this domain - USAGE
derivationallyRelated + drf wn30/own-pt/schema/derivationallyRelated n,v,a,s synset,word Derivationally related form
classifiedByTopic -c mt wn30/own-pt/schema/classifiedByTopic n,v,a,r,s synset,word Member of this domain - TOPIC
classifiedByRegion -r mr wn30/own-pt/schema/classifiedByRegion n,v,a,r,s synset,word Member of this domain - REGION
classifiedByUsage -u mu wn30/own-pt/schema/classifiedByUsage n,v,a,r,s synset,word Member of this domain - USAGE
classifiesByTopic ;c dt wn30/own-pt/schema/classifiesByTopic n synset,word Domain of synset - TOPIC
classifiesByRegion ;r dr wn30/own-pt/schema/classifiesByRegion n synset,word Domain of synset - REGION
classifiesByUsage ;u du wn30/own-pt/schema/classifiesByUsage n synset,word Domain of synset - USAGE
participleOf < pv wn30/own-pt/schema/participleOf a word Participle of verb
attribute = attr wn30/own-pt/schema/attribute na synset Attribute
attribute = attr wn30/own-pt/schema/attribute n,a synset Attribute
causes > cause wn30/own-pt/schema/causes v synset Cause
hypernymOf @ hyper wn30/own-pt/schema/hypernymOf nv synset Hypernym
hypernymOf @ hyper wn30/own-pt/schema/hypernymOf n,v synset Hypernym
instanceOf @i ihyper wn30/own-pt/schema/instanceOf n synset Instance Hypernym (9/11 is a terrorist attack)
pertainsTo \ pe wn30/own-pt/schema/pertainsTo ar word Pertainym pertains to noun/adjective
seeAlso ^ see wn30/own-pt/schema/seeAlso va synset,word Also see
hyponymOf ~ hypo wn30/own-pt/schema/hyponymOf nv synset Hyponym
pertainsTo \ pe wn30/own-pt/schema/pertainsTo a,r word Pertainym pertains to noun/adjective
seeAlso ^ see wn30/own-pt/schema/seeAlso v,a synset,word Also see
hyponymOf ~ hypo wn30/own-pt/schema/hyponymOf n,v synset Hyponym
hasInstance ~i ihypo wn30/own-pt/schema/hasInstance n synset Instance Hyponym (terrorist attack has instance 9/11)
lexicographerFile _ _ wn30/own-pt/schema/lexicographerFile nvar synset lexicographer file relation
definition _ d wn30/own-pt/schema/definition nvar synset definition relation
example _ e wn30/own-pt/schema/example nvar synset example relation
containsWordSense _ w wn30/own-pt/schema/containsWordSense nvar synset contains word sense relation
frame _ fs wn30/own-pt/schema/frame nvar synset,word frame relation
lexicalForm _ _ wn30/own-pt/schema/lexicalForm nvar synset lexicographer file relation
lexicographerFile _ _ wn30/own-pt/schema/lexicographerFile n,v,a,r,s synset lexicographer file relation
definition _ d wn30/own-pt/schema/definition n,v,a,r,s synset definition relation
example _ e wn30/own-pt/schema/example n,v,a,r,s synset example relation
containsWordSense _ w wn30/own-pt/schema/containsWordSense n,v,a,r synset contains word sense relation
frame _ fs wn30/own-pt/schema/frame v synset,word frame relation
lexicalForm _ _ wn30/own-pt/schema/lexicalForm n,v,a,r,s synset lexicographer file relation
syntacticMarker _ marker wn30/own-pt/schema/syntacticMarker a word marker
21 changes: 21 additions & 0 deletions src/Data.hs
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,29 @@ import Text.Printf (printf)
singleton :: a -> NonEmpty a
singleton x = x :| []

data WNObj = SynsetObj | WordObj deriving (Eq,Enum)

instance Show WNObj where
show SynsetObj = "synset"
show WordObj = "word"

readWNObj :: Text -> WNObj
readWNObj input = case input of
"synset" -> SynsetObj
"word" -> WordObj
_ -> error . T.unpack
$ T.intercalate " " ["Can't parse", input, "as WordNet object name (one of synset or word)"]

data WNPOS = A | S | R | N | V deriving (Eq,Enum,Ord,Show)

readWNPOS :: Text -> WNPOS
readWNPOS "n" = N
readWNPOS "a" = A
readWNPOS "r" = R
readWNPOS "v" = V
readWNPOS "s" = S
readWNPOS input = error $ T.unpack input ++ " is not a valid PoS"

newtype LexicographerFileId = LexicographerFileId (WNPOS, Text) deriving (Eq,Ord,Show)

synsetType :: WNPOS -> Int
Expand Down
36 changes: 24 additions & 12 deletions src/Lib.hs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ module Lib

import Data ( Synset(..), Unvalidated, Validated
, Validation(..), SourceValidation, singleton
, SourceError(..), WNError(..), SourcePosition(..) )
, SourceError(..), WNError(..), SourcePosition(..)
, WNObj(..), readWNObj, WNPOS(..), readWNPOS )
import Export (synsetToTriples,synsetsToSynsetJSONs)
import Parse (parseLexicographer)
import Validate ( makeIndex
Expand Down Expand Up @@ -56,14 +57,15 @@ data Config = Config
, textToCanonicNames :: Map Text Text
-- | maps canonical relation names to their RDF names
, canonicToRDFNames :: Map Text Text
-- | maps canonic relation names to their possible domains
, canonicToDomain :: Map Text (NonEmpty WNObj, NonEmpty WNPOS)
-- | contains the filepaths to the files in the WordNet
, lexFilePaths :: NonEmpty FilePath
} deriving (Show,Eq)

readTSV :: Ord a => FilePath -> ([Text] -> Either String [(a, b)]) -> IO (Map a b)
readTSV filepath readLine = do
text <- TIO.readFile filepath
case go text of
readTSV :: Ord a => Text -> ([Text] -> Either String [(a, b)]) -> IO (Map a b)
readTSV input readLine =
case go input of
([], pairs) -> return . M.fromList $ concat pairs
(errors, _) -> mapM_ putStrLn errors >> return M.empty
where
Expand All @@ -81,12 +83,16 @@ readConfig configurationDir' = do
go configurationDir
where
go configurationDir = do
lexnamesToId <- readTSV (configurationDir </> "lexnames.tsv") lexnamesReader
-- FIXME: reading file twice, but oh well..
textToCanonicNames <- readTSV (configurationDir </> "relations.tsv") textToCanonicNamesReader
canonicToRDFNames <- readTSV (configurationDir </> "relations.tsv") canonicToRDFNamesReader
lexNamesInput <- TIO.readFile $ configurationDir </> "lexnames.tsv"
lexnamesToId <- readTSV lexNamesInput lexnamesReader
relationsInput <- TIO.readFile $ configurationDir </> "relations.tsv"
textToCanonicNames <- readTSV relationsInput textToCanonicNamesReader
canonicToRDFNames <- readTSV relationsInput canonicToRDFNamesReader
canonicToDomain <- readTSV relationsInput canonicToDomainReader
let lexFilePaths = lexFilePaths' lexnamesToId
return $ Config {lexnamesToId, textToCanonicNames, canonicToRDFNames, lexFilePaths}
return $ Config {lexnamesToId, textToCanonicNames
, canonicToRDFNames, lexFilePaths, canonicToDomain
}
where
lexFilePaths' lexNamesMap =
let lexnames = map normalize (M.keys lexNamesMap)
Expand All @@ -105,15 +111,21 @@ readConfig configurationDir' = do
textToCanonicNamesReader _ = Left "Wrong number of fields in relations.tsv"
canonicToRDFNamesReader [canonicName,_,_,rdfName,_,_,_] = Right [(canonicName, rdfName)]
canonicToRDFNamesReader _ = Left "Wrong number of fields in relations.tsv"
canonicToDomainReader [canonicName,_,_,_,pos,domain,_] =
Right [(canonicName, ( readListField readWNObj domain
, readListField readWNPOS pos))]
canonicToDomainReader _ = Left "wrong number of Fiels in relations.tsv"
readListField f = NE.fromList . map (f . T.strip) . T.splitOn ","


type App = ReaderT Config IO

parseLexicographerFile :: FilePath -> App (SourceValidation (NonEmpty (Synset Unvalidated)))
parseLexicographerFile filePath = do
Config{textToCanonicNames} <- ask
Config{textToCanonicNames, canonicToDomain} <- ask
liftIO $ do
content <- TIO.readFile $ normalise filePath
let result = parseLexicographer textToCanonicNames filePath content
let result = parseLexicographer textToCanonicNames canonicToDomain filePath content
return result

parseLexicographerFiles :: NonEmpty FilePath
Expand Down
38 changes: 25 additions & 13 deletions src/Parse.hs
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,22 @@ import qualified Text.Megaparsec.Char.Lexer as L
--import Text.Megaparsec.Debug (dbg)
import qualified Data.Set as S

fst3 :: (a,b,c) -> a
fst3 (first,_,_) = first

type RawSynset = Either (ParseError Text Void) (Synset Unvalidated)

-- State stores in which lexicographer file we're in, this is useful
-- to fill in implicit references
type Parser = ParsecT Void Text (Reader (LexicographerFileId, Map Text Text))
type Parser = ParsecT Void Text (Reader (LexicographerFileId, Map Text Text, Map Text (NonEmpty WNObj, NonEmpty WNPOS)))

parseLexicographer :: Map Text Text
-> Map Text (NonEmpty WNObj, NonEmpty WNPOS)
-> String -> Text
-> SourceValidation (NonEmpty (Synset Unvalidated))
parseLexicographer relationsMap fileName inputText =
parseLexicographer textToCanonicNames canonicToDomain fileName inputText =
case runReader (runParserT lexicographerFile fileName inputText)
(lexFileId, relationsMap) of
(lexFileId, textToCanonicNames, canonicToDomain) of
Right rawSynsets
-> case partitionEithers (NE.toList rawSynsets) of
([], synsetsToValidate) -> Success $ NE.fromList synsetsToValidate
Expand Down Expand Up @@ -89,7 +92,7 @@ synsets = synsetOrError `NC.sepEndBy1` many linebreak
synset :: Parser (Synset Unvalidated)
synset = do
startOffset <- (1 +) <$> getOffset
lexicographerId <- reader fst
lexicographerId <- reader fst3
synsetWordSenses <- wordSenseStatement `NC.endBy1` linebreak
synsetDefinition <- definitionStatement <* linebreak
synsetExamples <- exampleStatement `endBy` linebreak
Expand Down Expand Up @@ -129,21 +132,30 @@ synsetRelationStatement :: Parser SynsetRelation
synsetRelationStatement = L.nonIndented spaceConsumer go
where
go = SynsetRelation
<$> relationNameP synsetRelationName
<$> relationNameP SynsetObj synsetRelationName
<*> (SynsetIdentifier <$> identifier)
synsetRelationName = T.stripEnd
-- [ ] handle this better
<$> (takeWhile1P Nothing (`notElem` [':', ' ', '\n']) <?> "Synset relation name")
<* symbol ":"

relationNameP :: Parser Text -> Parser Text
relationNameP name = do
relationsMap <- reader snd
relationName <- name
relationNameP :: WNObj -> Parser Text -> Parser Text
relationNameP obj name = do
LexicographerFileId (wnPOS, _) <- reader fst3
relationsMap <- reader $ \(_,second,_) -> second
relationsDomains <- reader $ \(_,_,third) -> third
relationName <- name
let relationString = T.unpack relationName
case M.lookup relationName relationsMap of
Just _ -> if relationName `elem` ["d", "e", "fs", "w"]
Just canonicName -> if relationName `elem` ["d", "e", "fs", "w"]
then fail "Synset components must come in the following order: words, definition, examples, frames, and synset relations"
else return relationName
else case M.lookup canonicName relationsDomains of
Just (domain,poses) -> case (obj `elem` domain, wnPOS `elem` poses) of
(True,True) -> return relationName
_ -> fail $ relationString
++ " is not a valid relation for a " ++ show obj
++ " with PoS " ++ show wnPOS
Nothing -> error $ "Can't find " ++ T.unpack relationName ++ "'s domain"
Nothing -> failure (Just $ toErrorItem relationName)
(S.fromList . map toErrorItem $ M.keys relationsMap)
where
Expand All @@ -163,7 +175,7 @@ wordSenseIdentifier = WordSenseIdentifier <$> identifier
identifier :: Parser (LexicographerFileId, WordSenseForm, LexicalId)
identifier =
(,,)
<$> (try lexicographerIdentifier <|> reader fst)
<$> (try lexicographerIdentifier <|> reader fst3)
<*> fmap WordSenseForm word <*> lexicalIdentifier
where
lexicographerIdentifier = do
Expand All @@ -175,7 +187,7 @@ wordSensePointers :: Parser [WordPointer]
wordSensePointers = many go
where
go = WordPointer
<$> relationNameP (word <?> "Word pointer")
<$> relationNameP WordObj (word <?> "Word pointer")
<*> wordSenseIdentifier

word :: Parser Text
Expand Down