From c4eb771fa09e3972106d80ada6b3c4a023b85249 Mon Sep 17 00:00:00 2001 From: Joris Date: Sun, 30 Aug 2015 21:17:27 +0200 Subject: Fetch perfumes according to multiple materials --- src/PerfumeParser.hs | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 src/PerfumeParser.hs (limited to 'src/PerfumeParser.hs') diff --git a/src/PerfumeParser.hs b/src/PerfumeParser.hs new file mode 100644 index 0000000..1b200d9 --- /dev/null +++ b/src/PerfumeParser.hs @@ -0,0 +1,31 @@ +{-# LANGUAGE OverloadedStrings #-} + +module PerfumeParser + ( parsePerfumes + ) where + +import Data.Text (Text) +import qualified Data.Text as T +import Text.HTML.TagSoup +import Data.List (find) + +import Model.URL +import Model.Perfume + +parsePerfumes :: Text -> [Perfume] +parsePerfumes page = getPerfumes . getSecondTagsInside "tbody" $ parseTags page + +getSecondTagsInside :: String -> [Tag Text] -> [Tag Text] +getSecondTagsInside selector = + takeWhile (~/= ("")) + . dropWhile (~/= ("<" ++ selector ++ ">")) + . drop 1 + . dropWhile (~/= ("<" ++ selector ++ ">")) + +getPerfumes :: [Tag Text] -> [Perfume] +getPerfumes (TagOpen "a" attributes : TagText name : xs) = + case find ((==) "href" . fst) attributes of + Just (_, url) -> Perfume name (T.concat [site, "/", url]) : getPerfumes xs + Nothing -> getPerfumes xs +getPerfumes (_:xs) = getPerfumes xs +getPerfumes [] = [] -- cgit v1.2.3