From 0a4d3c8f12dc5797a919a00b6bcaf759947687cc Mon Sep 17 00:00:00 2001 From: Joris Date: Sun, 17 Jun 2018 23:24:47 +0200 Subject: Add ouest france parser --- src/parser/haskell/Model/Ad.hs | 22 ++++++++++++ src/parser/haskell/Model/URL.hs | 7 ++++ src/parser/haskell/Parser/LeboncoinParser.hs | 24 +++++++++++++ src/parser/haskell/Parser/OuestFranceParser.hs | 27 ++++++++++++++ src/parser/haskell/Parser/Utils.hs | 49 ++++++++++++++++++++++++++ 5 files changed, 129 insertions(+) create mode 100644 src/parser/haskell/Model/Ad.hs create mode 100644 src/parser/haskell/Model/URL.hs create mode 100644 src/parser/haskell/Parser/LeboncoinParser.hs create mode 100644 src/parser/haskell/Parser/OuestFranceParser.hs create mode 100644 src/parser/haskell/Parser/Utils.hs (limited to 'src/parser') diff --git a/src/parser/haskell/Model/Ad.hs b/src/parser/haskell/Model/Ad.hs new file mode 100644 index 0000000..06906eb --- /dev/null +++ b/src/parser/haskell/Model/Ad.hs @@ -0,0 +1,22 @@ +module Model.Ad + ( Ad(..) + , getNewAds + ) where + +import Data.List ((\\)) +import Data.Text (Text) + +import Model.URL (URL) + +data Ad = Ad + { name :: Text + , location :: Text + , price :: Maybe Text + , url :: URL + } deriving (Eq, Read, Show) + +getNewAds :: [URL] -> [Ad] -> ([URL], [Ad]) +getNewAds viewdURLs ads = + let newURLs = (map url ads) \\ viewdURLs + newAds = filter (\ad -> elem (url ad) newURLs) ads + in (newURLs, newAds) diff --git a/src/parser/haskell/Model/URL.hs b/src/parser/haskell/Model/URL.hs new file mode 100644 index 0000000..2114113 --- /dev/null +++ b/src/parser/haskell/Model/URL.hs @@ -0,0 +1,7 @@ +module Model.URL + ( URL + ) where + +import Data.Text + +type URL = Text diff --git a/src/parser/haskell/Parser/LeboncoinParser.hs b/src/parser/haskell/Parser/LeboncoinParser.hs new file mode 100644 index 0000000..48eb80f --- /dev/null +++ b/src/parser/haskell/Parser/LeboncoinParser.hs @@ -0,0 +1,24 @@ +module Parser.LeboncoinParser + ( parse + ) where + +import Data.Maybe (catMaybes) +import Data.Text (Text) +import qualified Data.Text as T +import Text.HTML.TagSoup + +import Model.Ad (Ad (Ad)) +import Parser.Utils + +parse :: Text -> [Ad] +parse page = + catMaybes . fmap parseAd $ partitions (~== "") tags + where tags = getTagsBetween "
  • " "
    " (parseTags page) + +parseAd :: [Tag Text] -> Maybe Ad +parseAd tags = do + name <- getTagTextAfter "

    " tags + location <- getTagAttribute "" (T.pack "content") tags + let price = getTagTextAfter "

    " tags + url <- getTagAttribute "" (T.pack "href") tags + return (Ad name location price (T.concat [T.pack "https:", url])) diff --git a/src/parser/haskell/Parser/OuestFranceParser.hs b/src/parser/haskell/Parser/OuestFranceParser.hs new file mode 100644 index 0000000..a7b6360 --- /dev/null +++ b/src/parser/haskell/Parser/OuestFranceParser.hs @@ -0,0 +1,27 @@ +module Parser.OuestFranceParser + ( parse + ) where + +import Data.Maybe (catMaybes) +import Data.Text (Text) +import qualified Data.Text as T + +import Text.HTML.TagSoup + +import Model.Ad (Ad (Ad)) + +import Parser.Utils + +parse :: Text -> [Ad] +parse page = + catMaybes . fmap parseAd $ partitions (~== "") tags + where tags = getTagsBetween "
    " "
    " (parseTags page) + +parseAd :: [Tag Text] -> Maybe Ad +parseAd tags = do + name <- getTagTextAfter "" tags + location <- getTagTextAfter "" tags + let price = getTagTextAfter "" tags + let startUrl = T.pack "https://www.ouestfrance-immo.com/" + url <- getTagAttribute "" (T.pack "href") tags + return (Ad name location price (T.concat [startUrl, url])) diff --git a/src/parser/haskell/Parser/Utils.hs b/src/parser/haskell/Parser/Utils.hs new file mode 100644 index 0000000..7c433c6 --- /dev/null +++ b/src/parser/haskell/Parser/Utils.hs @@ -0,0 +1,49 @@ +module Parser.Utils + ( getTagsBefore + , getTagsAfter + , getTagsBetween + , getTagAttributes + , getTagAttribute + , getTagTextAfter + ) where + +import Data.List (find, findIndex) +import Data.Maybe (catMaybes, listToMaybe) +import Data.Text (Text) +import qualified Data.Text as T + +import Text.HTML.TagSoup + +getTagsBefore :: String -> [Tag Text] -> [Tag Text] +getTagsBefore selector = takeWhile (~/= selector) + +getTagsAfter :: String -> [Tag Text] -> [Tag Text] +getTagsAfter selector = drop 1 . dropWhile (~/= selector) + +getTagsBetween :: String -> String -> [Tag Text] -> [Tag Text] +getTagsBetween begin end = getTagsBefore end . getTagsAfter begin + +getTagAttributes :: String -> Text -> [Tag Text] -> [Text] +getTagAttributes selector attribute = + catMaybes + . fmap (maybeTagAttribute attribute) + . filter (~== selector) + +getTagAttribute :: String -> Text -> [Tag Text] -> Maybe Text +getTagAttribute selector attribute = + listToMaybe + . getTagAttributes selector attribute + +getTagTextAfter :: String -> [Tag Text] -> Maybe Text +getTagTextAfter selector tags = + case findIndex (~== selector) tags of + Just index -> fmap T.strip $ safeGetAt (index + 1) tags >>= maybeTagText + Nothing -> Nothing + +maybeTagAttribute :: Text -> Tag Text -> Maybe Text +maybeTagAttribute name (TagOpen _ xs) = + fmap snd . find (\(x, _) -> x == name) $ xs +maybeTagAttribute _ _ = Nothing + +safeGetAt :: Int -> [a] -> Maybe a +safeGetAt index = listToMaybe . drop index -- cgit v1.2.3