aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorJoris2015-08-30 21:17:27 +0200
committerJoris2015-08-30 21:17:27 +0200
commitc4eb771fa09e3972106d80ada6b3c4a023b85249 (patch)
tree9214b64628e34089b73ff6b4bdcb0edbc668a51e /src
Fetch perfumes according to multiple materials
Diffstat (limited to 'src')
-rw-r--r--src/CSV.hs18
-rw-r--r--src/HTTP.hs23
-rw-r--r--src/Main.hs56
-rw-r--r--src/Model/Json/Search.hs27
-rw-r--r--src/Model/Perfume.hs36
-rw-r--r--src/Model/URL.hs22
-rw-r--r--src/PerfumeParser.hs31
7 files changed, 213 insertions, 0 deletions
diff --git a/src/CSV.hs b/src/CSV.hs
new file mode 100644
index 0000000..80c6d76
--- /dev/null
+++ b/src/CSV.hs
@@ -0,0 +1,18 @@
+{-# LANGUAGE OverloadedStrings #-}
+
+module CSV
+ ( getCsv
+ ) where
+
+import Data.Text (Text)
+import qualified Data.Text as T
+
+getCsv :: [[Text]] -> Text
+getCsv = T.intercalate "\n" . map (T.intercalate "," . map quote)
+
+quote :: Text -> Text
+quote text = T.concat [ "\"", T.concatMap escape text, "\"" ]
+
+escape :: Char -> Text
+escape '"' = "\"\""
+escape x = T.singleton x
diff --git a/src/HTTP.hs b/src/HTTP.hs
new file mode 100644
index 0000000..6ba153d
--- /dev/null
+++ b/src/HTTP.hs
@@ -0,0 +1,23 @@
+module HTTP
+ ( getPage
+ ) where
+
+import Control.Exception (SomeException, try)
+import Control.Arrow (left)
+
+import Data.Text (Text)
+import qualified Data.Text as T
+import qualified Data.Text.IO as T
+
+import Network.HTTP (simpleHTTP, getRequest, getResponseBody)
+
+import Model.URL
+
+import Codec.Binary.UTF8.String (decodeString)
+
+getPage :: URL -> IO (Either Text Text)
+getPage url =
+ left (T.pack . show) <$> (try (unsafeGetPage url) :: IO (Either SomeException Text))
+
+unsafeGetPage :: URL -> IO Text
+unsafeGetPage url = simpleHTTP (getRequest (T.unpack url)) >>= (\x -> T.pack . decodeString <$> getResponseBody x)
diff --git a/src/Main.hs b/src/Main.hs
new file mode 100644
index 0000000..cf557a0
--- /dev/null
+++ b/src/Main.hs
@@ -0,0 +1,56 @@
+{-# LANGUAGE OverloadedStrings #-}
+
+module Main
+ ( main
+ ) where
+
+import HTTP
+
+import System.Environment (getArgs, getProgName)
+
+import Data.List (intercalate)
+import Data.Text (Text)
+import qualified Data.Text as T
+import qualified Data.Text.IO as T
+import Data.Text.Encoding (encodeUtf8)
+import Data.Aeson (eitherDecodeStrict)
+
+import Model.URL
+import Model.Json.Search
+import Model.Perfume
+
+import PerfumeParser (parsePerfumes)
+
+main :: IO ()
+main = do
+ args <- getArgs
+ case args of
+ [] -> do
+ progName <- T.pack <$> getProgName
+ T.putStrLn (T.concat ["Usage: ", progName, " material"])
+ materials -> do
+ eitherPages <- fmap sequence <$> sequence . map getHtmlPerfumes . map T.pack $ materials
+ case eitherPages of
+ Left error ->
+ T.putStrLn error
+ Right perfumesSequences ->
+ let commonPerfumes = getCommonPerfumes perfumesSequences
+ outputName = (intercalate "-" materials) ++ ".csv"
+ in T.writeFile outputName (csvPerfumes commonPerfumes)
+
+getHtmlPerfumes :: Text -> IO (Either Text [Perfume])
+getHtmlPerfumes material = do
+ eitherPage <- getPage (getIdURL material)
+ case eitherPage of
+ Left error ->
+ return . Left $ error
+ Right page ->
+ case eitherDecodeStrict (encodeUtf8 page) :: Either String [Search] of
+ Left error ->
+ return . Left . T.pack $ error
+ Right searches ->
+ case getMaterialIdentifier searches of
+ Nothing ->
+ return . Left $ "No material identifier found"
+ Just identifier -> do
+ fmap parsePerfumes <$> getPage (getMaterialURL identifier)
diff --git a/src/Model/Json/Search.hs b/src/Model/Json/Search.hs
new file mode 100644
index 0000000..194a4c8
--- /dev/null
+++ b/src/Model/Json/Search.hs
@@ -0,0 +1,27 @@
+{-# LANGUAGE OverloadedStrings #-}
+
+module Model.Json.Search
+ ( Search(..)
+ , getMaterialIdentifier
+ ) where
+
+import Control.Monad (mzero)
+
+import Data.Aeson
+import Data.Text (Text)
+import Data.List (find)
+
+data Search = Search
+ { identifier :: Text
+ , kind :: Text
+ } deriving (Show)
+
+instance FromJSON Search where
+ parseJSON (Object v) =
+ Search <$>
+ v .: "id" <*>
+ v .: "type"
+ parseJSON _ = mzero
+
+getMaterialIdentifier :: [Search] -> Maybe Text
+getMaterialIdentifier = fmap identifier . find ((==) "matiere" . kind)
diff --git a/src/Model/Perfume.hs b/src/Model/Perfume.hs
new file mode 100644
index 0000000..0f99b55
--- /dev/null
+++ b/src/Model/Perfume.hs
@@ -0,0 +1,36 @@
+{-# LANGUAGE OverloadedStrings #-}
+
+module Model.Perfume
+ ( Perfume(..)
+ , csvPerfumes
+ , getCommonPerfumes
+ ) where
+
+import Data.Text (Text)
+import qualified Data.Text as T
+
+import Model.URL
+
+import CSV
+
+data Perfume = Perfume
+ { name :: Text
+ , url :: URL
+ } deriving (Eq, Show)
+
+csvPerfumes :: [Perfume] -> Text
+csvPerfumes = getCsv . map (\perfume -> [name perfume, url perfume])
+
+getCommonPerfumes :: [[Perfume]] -> [Perfume]
+getCommonPerfumes (perfumes1:perfumesSequences) =
+ filter
+ (\p1 ->
+ all
+ (\perfumes2 ->
+ any
+ (\p2 -> name p1 == name p2)
+ perfumes2
+ )
+ perfumesSequences
+ )
+ perfumes1
diff --git a/src/Model/URL.hs b/src/Model/URL.hs
new file mode 100644
index 0000000..76a9e86
--- /dev/null
+++ b/src/Model/URL.hs
@@ -0,0 +1,22 @@
+{-# LANGUAGE OverloadedStrings #-}
+
+module Model.URL
+ ( URL
+ , getIdURL
+ , getMaterialURL
+ , site
+ ) where
+
+import Data.Text (Text)
+import qualified Data.Text as T
+
+type URL = Text
+
+getIdURL :: Text -> URL
+getIdURL material = T.concat [site, "/ajax/global.php?term=", material]
+
+getMaterialURL :: Text -> URL
+getMaterialURL materialId = T.concat [site, "/fiche-matiere-", materialId, ".php"]
+
+site :: URL
+site = "http://olfatheque.com"
diff --git a/src/PerfumeParser.hs b/src/PerfumeParser.hs
new file mode 100644
index 0000000..1b200d9
--- /dev/null
+++ b/src/PerfumeParser.hs
@@ -0,0 +1,31 @@
+{-# LANGUAGE OverloadedStrings #-}
+
+module PerfumeParser
+ ( parsePerfumes
+ ) where
+
+import Data.Text (Text)
+import qualified Data.Text as T
+import Text.HTML.TagSoup
+import Data.List (find)
+
+import Model.URL
+import Model.Perfume
+
+parsePerfumes :: Text -> [Perfume]
+parsePerfumes page = getPerfumes . getSecondTagsInside "tbody" $ parseTags page
+
+getSecondTagsInside :: String -> [Tag Text] -> [Tag Text]
+getSecondTagsInside selector =
+ takeWhile (~/= ("</" ++ selector ++ ">"))
+ . dropWhile (~/= ("<" ++ selector ++ ">"))
+ . drop 1
+ . dropWhile (~/= ("<" ++ selector ++ ">"))
+
+getPerfumes :: [Tag Text] -> [Perfume]
+getPerfumes (TagOpen "a" attributes : TagText name : xs) =
+ case find ((==) "href" . fst) attributes of
+ Just (_, url) -> Perfume name (T.concat [site, "/", url]) : getPerfumes xs
+ Nothing -> getPerfumes xs
+getPerfumes (_:xs) = getPerfumes xs
+getPerfumes [] = []