-- Aelve Codesearch
--
-- grep over package repositories
-- concrete-haskell-0.1.0.16
-- src/Data/Concrete/Parsers.hs
{-# LANGUAGE DeriveGeneric, OverloadedStrings #-}
{-|
Description : Parsers and utilities for ingesting various formats to Concrete

This module is designed to enable converting arbitrary formats
described as simple context-free (or even context-sensitive)
grammars into Concrete Communication objects.

A base UUID is generated randomly for each Communication, and subsequent UUIDs
within the Communication are increments of the base: this allows for more
efficient compression.
-}
module Data.Concrete.Parsers
       ( communicationParsers
       , ingest
       ) where

import qualified Data.Map as Map
import Data.Vector (fromList, Vector)
import Data.Map (Map)
import Data.Concrete.Utils (createAnnotationMetadata, getUUID, writeCommunication)
import System.IO (stdin, stdout, stderr, openFile, Handle, IOMode(..), hPutStrLn)
import Control.Monad.State (runStateT)
import Data.ByteString.Lazy (ByteString)
import Data.Text.Lazy (Text, pack, unpack)
import Data.Concrete.Autogen.Communication_Types (default_Communication, Communication(..))
import Data.Concrete.Parsers.Types
import Control.Monad.IO.Class (liftIO)
import Text.Megaparsec.Char (space)
import Text.Megaparsec (runParserT', initialPos, State(..), mkPos, parseErrorPretty, eof, ParsecT)
import qualified Data.List.NonEmpty as NE
import Data.Void (Void)
import Data.Vector (Vector, fromList, snoc, empty)
import Data.Concrete.Parsers.Utils (finalizeCommunication)
import qualified Data.Concrete.Parsers.JSON as JSON
import Control.Monad.Identity (Identity(..))
import Data.Conduit.List (unfold)
import Conduit

import Text.Printf (printf)
import qualified Data.Concrete.Parsers.CONLL as CONLL
import qualified Data.Concrete.Parsers.CSV as CSV
import qualified Data.Concrete.Parsers.PTB as PTB
import qualified Data.Concrete.Parsers.HTML as HTML
import qualified Data.Concrete.Parsers.XML as XML
import qualified Data.Concrete.Parsers.Email as Email
import Control.Monad (void, join, liftM)
import Data.Text.Lazy.Encoding (decodeUtf8)

-- | Built-in ingest configurations, keyed by format name.
--
-- Every entry has the shape
-- @(name, (description, source, fields, idTemplate))@.
--
-- NOTE(review): the @fields@ list and @idTemplate@ string look like the
-- default parameters meant to be handed to 'ingest' for that format --
-- confirm against the callers of this table.
communicationParsers =
  [ ( "JSON-ARRAY"
    , ( "JSON array of objects"
      , JSON.arraySource
      , ["catchphrase", "relatives.0.name"]
      , "json_${name}"
      )
    )
  , ( "JSON-SEQUENCE"
    , ( "One JSON object per line"
      , JSON.sequenceSource
      , ["author", "subreddit"]
      , "json-lines_${name}"
      )
    )
  , ( "CONLL-U"
    , ( "CONLL-U format"
      , CONLL.sequenceSource CONLL.ufields
      , ["sentence"]
      , "conll_${}"
      )
    )
    -- The annotations on this entry pin the element types of the last two
    -- tuple components for the whole list.
  , ( "PTB"
    , ( "PENN Treebank format"
      , PTB.sequenceSource
      , ["sentence"] :: [Text]
      , "ptb_${}" :: Text
      )
    )
    -- The formats below are currently disabled; their entries are kept
    -- commented out for reference.
    --
    -- , ( "CSV"
    --   , ( "CSV format (with header, commas)"
    --     , CSV.sequenceSource True ','
    --     , [ "technology"
    --       , "Bush"
    --       , "Gore"
    --       ]
    --     , "csv_${county}"
    --     )
    --   )
    -- , ( "HTML"
    --   , ( "HTML format"
    --     , HTML.sequenceSource
    --     , []
    --     , "id_${}"
    --     )
    --   )
    -- , ( "XML"
    --   , ( "XML format"
    --     , XML.sequenceSource
    --     , []
    --     , "id_${}"
    --     )
    --   )
    -- , ( "Email"
    --   , ( "Email format"
    --     , Email.sequenceSource
    --     , []
    --     , "id_${}"
    --     )
    --   )
  ]

-- | Drive an ingest: feed the input text to the source conduit, pair each
-- emitted 'Communication' with a counter drawn from @[1..]@, pass the pair
-- through 'finalizeCommunication' (with the given ID string and an empty
-- list), and run the callback on every result.
--
-- The callback's results are discarded; the function returns once the
-- pipeline has run to completion.
--
-- NOTE(review): the @filt@, @cs@ and @ct@ arguments are accepted but never
-- used in this body -- the filter is not applied and 'finalizeCommunication'
-- receives @[]@ rather than @cs@. Confirm whether that is intentional.
ingest :: (Text -> ConduitM () Communication IO ()) -> (Communication -> IO ()) -> (Communication -> Bool) -> Text -> [Text] -> Text -> Text -> IO ()
ingest src cb filt txt cs idStr ct =
  runConduit
    ( src txt
        .| mergeSource counter
        .| mapMC finalize
        .| mapMC cb
        .| sinkNull
    )
  where
    -- Unbounded sequence numbers, one per emitted Communication.
    counter = yieldMany [1..]
    -- Finalization step applied to every (counter, Communication) pair.
    finalize = finalizeCommunication idStr []