From c4eb771fa09e3972106d80ada6b3c4a023b85249 Mon Sep 17 00:00:00 2001 From: Joris Date: Sun, 30 Aug 2015 21:17:27 +0200 Subject: Fetch perfumes according to multiple materials --- src/CSV.hs | 18 ++++++++++++++++ src/HTTP.hs | 23 ++++++++++++++++++++ src/Main.hs | 56 ++++++++++++++++++++++++++++++++++++++++++++++++ src/Model/Json/Search.hs | 27 +++++++++++++++++++++++ src/Model/Perfume.hs | 36 +++++++++++++++++++++++++++++++ src/Model/URL.hs | 22 +++++++++++++++++++ src/PerfumeParser.hs | 31 +++++++++++++++++++++++++++ 7 files changed, 213 insertions(+) create mode 100644 src/CSV.hs create mode 100644 src/HTTP.hs create mode 100644 src/Main.hs create mode 100644 src/Model/Json/Search.hs create mode 100644 src/Model/Perfume.hs create mode 100644 src/Model/URL.hs create mode 100644 src/PerfumeParser.hs (limited to 'src') diff --git a/src/CSV.hs b/src/CSV.hs new file mode 100644 index 0000000..80c6d76 --- /dev/null +++ b/src/CSV.hs @@ -0,0 +1,18 @@ +{-# LANGUAGE OverloadedStrings #-} + +module CSV + ( getCsv + ) where + +import Data.Text (Text) +import qualified Data.Text as T + +getCsv :: [[Text]] -> Text +getCsv = T.intercalate "\n" . map (T.intercalate "," . map quote) + +quote :: Text -> Text +quote text = T.concat [ "\"", T.concatMap escape text, "\"" ] + +escape :: Char -> Text +escape '"' = "\"\"" +escape x = T.singleton x diff --git a/src/HTTP.hs b/src/HTTP.hs new file mode 100644 index 0000000..6ba153d --- /dev/null +++ b/src/HTTP.hs @@ -0,0 +1,23 @@ +module HTTP + ( getPage + ) where + +import Control.Exception (SomeException, try) +import Control.Arrow (left) + +import Data.Text (Text) +import qualified Data.Text as T +import qualified Data.Text.IO as T + +import Network.HTTP (simpleHTTP, getRequest, getResponseBody) + +import Model.URL + +import Codec.Binary.UTF8.String (decodeString) + +getPage :: URL -> IO (Either Text Text) +getPage url = + left (T.pack . show) <$> (try (unsafeGetPage url) :: IO (Either SomeException Text)) + +unsafeGetPage :: URL -> IO Text +unsafeGetPage url = simpleHTTP (getRequest (T.unpack url)) >>= (\x -> T.pack . decodeString <$> getResponseBody x) diff --git a/src/Main.hs b/src/Main.hs new file mode 100644 index 0000000..cf557a0 --- /dev/null +++ b/src/Main.hs @@ -0,0 +1,56 @@ +{-# LANGUAGE OverloadedStrings #-} + +module Main + ( main + ) where + +import HTTP + +import System.Environment (getArgs, getProgName) + +import Data.List (intercalate) +import Data.Text (Text) +import qualified Data.Text as T +import qualified Data.Text.IO as T +import Data.Text.Encoding (encodeUtf8) +import Data.Aeson (eitherDecodeStrict) + +import Model.URL +import Model.Json.Search +import Model.Perfume + +import PerfumeParser (parsePerfumes) + +main :: IO () +main = do + args <- getArgs + case args of + [] -> do + progName <- T.pack <$> getProgName + T.putStrLn (T.concat ["Usage: ", progName, " material"]) + materials -> do + eitherPages <- fmap sequence <$> sequence . map getHtmlPerfumes . map T.pack $ materials + case eitherPages of + Left error -> + T.putStrLn error + Right perfumesSequences -> + let commonPerfumes = getCommonPerfumes perfumesSequences + outputName = (intercalate "-" materials) ++ ".csv" + in T.writeFile outputName (csvPerfumes commonPerfumes) + +getHtmlPerfumes :: Text -> IO (Either Text [Perfume]) +getHtmlPerfumes material = do + eitherPage <- getPage (getIdURL material) + case eitherPage of + Left error -> + return . Left $ error + Right page -> + case eitherDecodeStrict (encodeUtf8 page) :: Either String [Search] of + Left error -> + return . Left . T.pack $ error + Right searches -> + case getMaterialIdentifier searches of + Nothing -> + return . Left $ "No material identifier found" + Just identifier -> do + fmap parsePerfumes <$> getPage (getMaterialURL identifier) diff --git a/src/Model/Json/Search.hs b/src/Model/Json/Search.hs new file mode 100644 index 0000000..194a4c8 --- /dev/null +++ b/src/Model/Json/Search.hs @@ -0,0 +1,27 @@ +{-# LANGUAGE OverloadedStrings #-} + +module Model.Json.Search + ( Search(..) + , getMaterialIdentifier + ) where + +import Control.Monad (mzero) + +import Data.Aeson +import Data.Text (Text) +import Data.List (find) + +data Search = Search + { identifier :: Text + , kind :: Text + } deriving (Show) + +instance FromJSON Search where + parseJSON (Object v) = + Search <$> + v .: "id" <*> + v .: "type" + parseJSON _ = mzero + +getMaterialIdentifier :: [Search] -> Maybe Text +getMaterialIdentifier = fmap identifier . find ((==) "matiere" . kind) diff --git a/src/Model/Perfume.hs b/src/Model/Perfume.hs new file mode 100644 index 0000000..0f99b55 --- /dev/null +++ b/src/Model/Perfume.hs @@ -0,0 +1,36 @@ +{-# LANGUAGE OverloadedStrings #-} + +module Model.Perfume + ( Perfume(..) + , csvPerfumes + , getCommonPerfumes + ) where + +import Data.Text (Text) +import qualified Data.Text as T + +import Model.URL + +import CSV + +data Perfume = Perfume + { name :: Text + , url :: URL + } deriving (Eq, Show) + +csvPerfumes :: [Perfume] -> Text +csvPerfumes = getCsv . map (\perfume -> [name perfume, url perfume]) + +getCommonPerfumes :: [[Perfume]] -> [Perfume] +getCommonPerfumes (perfumes1:perfumesSequences) = + filter + (\p1 -> + all + (\perfumes2 -> + any + (\p2 -> name p1 == name p2) + perfumes2 + ) + perfumesSequences + ) + perfumes1 diff --git a/src/Model/URL.hs b/src/Model/URL.hs new file mode 100644 index 0000000..76a9e86 --- /dev/null +++ b/src/Model/URL.hs @@ -0,0 +1,22 @@ +{-# LANGUAGE OverloadedStrings #-} + +module Model.URL + ( URL + , getIdURL + , getMaterialURL + , site + ) where + +import Data.Text (Text) +import qualified Data.Text as T + +type URL = Text + +getIdURL :: Text -> URL +getIdURL material = T.concat [site, "/ajax/global.php?term=", material] + +getMaterialURL :: Text -> URL +getMaterialURL materialId = T.concat [site, "/fiche-matiere-", materialId, ".php"] + +site :: URL +site = "http://olfatheque.com" diff --git a/src/PerfumeParser.hs b/src/PerfumeParser.hs new file mode 100644 index 0000000..1b200d9 --- /dev/null +++ b/src/PerfumeParser.hs @@ -0,0 +1,31 @@ +{-# LANGUAGE OverloadedStrings #-} + +module PerfumeParser + ( parsePerfumes + ) where + +import Data.Text (Text) +import qualified Data.Text as T +import Text.HTML.TagSoup +import Data.List (find) + +import Model.URL +import Model.Perfume + +parsePerfumes :: Text -> [Perfume] +parsePerfumes page = getPerfumes . getSecondTagsInside "tbody" $ parseTags page + +getSecondTagsInside :: String -> [Tag Text] -> [Tag Text] +getSecondTagsInside selector = + takeWhile (~/= ("")) + . dropWhile (~/= ("<" ++ selector ++ ">")) + . drop 1 + . dropWhile (~/= ("<" ++ selector ++ ">")) + +getPerfumes :: [Tag Text] -> [Perfume] +getPerfumes (TagOpen "a" attributes : TagText name : xs) = + case find ((==) "href" . fst) attributes of + Just (_, url) -> Perfume name (T.concat [site, "/", url]) : getPerfumes xs + Nothing -> getPerfumes xs +getPerfumes (_:xs) = getPerfumes xs +getPerfumes [] = [] -- cgit v1.2.3