diff options
author | Joris | 2018-06-19 22:49:16 +0200 |
---|---|---|
committer | Joris | 2018-06-19 22:49:16 +0200 |
commit | 149a0470b73781022e584aaeaa7ce871d6f4173b (patch) | |
tree | c1cc762e105ae19e7f3daaa3e9279a467dbaa3dc /src/lib/haskell | |
parent | 5d921c9a2b0a7a8f1a1bb5642cbefa516cbbe4cc (diff) |
Add automatic tests on remote pages
Diffstat (limited to 'src/lib/haskell')
-rw-r--r-- | src/lib/haskell/FetchAd.hs | 36 | ||||
-rw-r--r-- | src/lib/haskell/Model/Ad.hs | 22 | ||||
-rw-r--r-- | src/lib/haskell/Model/URL.hs | 7 | ||||
-rw-r--r-- | src/lib/haskell/Parser/LeboncoinParser.hs | 24 | ||||
-rw-r--r-- | src/lib/haskell/Parser/OuestFranceParser.hs | 25 | ||||
-rw-r--r-- | src/lib/haskell/Parser/SeLogerParser.hs | 24 | ||||
-rw-r--r-- | src/lib/haskell/Parser/Utils.hs | 48 | ||||
-rw-r--r-- | src/lib/haskell/Utils/HTTP.hs | 22 |
8 files changed, 208 insertions, 0 deletions
diff --git a/src/lib/haskell/FetchAd.hs b/src/lib/haskell/FetchAd.hs
new file mode 100644
index 0000000..a206181
--- /dev/null
+++ b/src/lib/haskell/FetchAd.hs
@@ -0,0 +1,39 @@
+-- | Download listing pages from each supported site and parse them into ads.
+module FetchAd
+  ( leboncoin
+  , ouestFrance
+  , seLoger
+  ) where
+
+import Data.ByteString (ByteString)
+import Data.Either (rights)
+import Data.Text (Text)
+import Data.Text.Encoding as T
+
+import Model.Ad (Ad)
+import Model.URL (URL)
+import qualified Parser.LeboncoinParser as LeboncoinParser
+import qualified Parser.OuestFranceParser as OuestFranceParser
+import qualified Parser.SeLogerParser as SeLogerParser
+import qualified Utils.HTTP as HTTP
+
+-- | Fetch Leboncoin listing pages (decoded as Latin-1) and parse their ads.
+leboncoin :: [URL] -> IO [Ad]
+leboncoin = fetchAds T.decodeLatin1 LeboncoinParser.parse
+
+-- | Fetch Ouest-France listing pages (decoded as UTF-8) and parse their ads.
+ouestFrance :: [URL] -> IO [Ad]
+ouestFrance = fetchAds T.decodeUtf8 OuestFranceParser.parse
+
+-- | Fetch SeLoger listing pages (decoded as UTF-8) and parse their ads.
+seLoger :: [URL] -> IO [Ad]
+seLoger = fetchAds T.decodeUtf8 SeLogerParser.parse
+
+-- | Download every URL in order, decode each body with @decode@ and
+-- concatenate the ads that @parse@ extracts from the pages.
+--
+-- Fetching is best effort: a page for which 'HTTP.get' returns 'Left' is
+-- dropped and does not abort the remaining pages.
+fetchAds :: (ByteString -> Text) -> (Text -> [Ad]) -> [URL] -> IO [Ad]
+fetchAds decode parse urls =
+  concatMap parse . rights <$> traverse (HTTP.get decode) urls
diff --git a/src/lib/haskell/Model/Ad.hs b/src/lib/haskell/Model/Ad.hs
new file mode 100644
index 0000000..06906eb
--- /dev/null
+++ b/src/lib/haskell/Model/Ad.hs
@@ -0,0 +1,31 @@
+-- | The ad record shared by the site parsers, and detection of unseen ads.
+module Model.Ad
+  ( Ad(..)
+  , getNewAds
+  ) where
+
+import Data.Text (Text)
+
+import Model.URL (URL)
+
+-- | One ad scraped from a listing page.
+data Ad = Ad
+  { name :: Text
+  , location :: Text
+  , price :: Maybe Text -- ^ 'Nothing' when the parser found no price
+  , url :: URL
+  } deriving (Eq, Read, Show)
+
+-- | Keep the ads whose URL is not among the already viewed URLs.
+--
+-- Returns the URLs of the retained ads and the ads themselves, both in input
+-- order.
+--
+-- Every ad is tested for membership on its own. List difference is not used
+-- because it cancels a single occurrence per viewed URL, so an already viewed
+-- ad that is listed twice would be reported as new.
+getNewAds :: [URL] -> [Ad] -> ([URL], [Ad])
+getNewAds viewedURLs ads = (map url newAds, newAds)
+  where
+    newAds = filter isNew ads
+    isNew ad = url ad `notElem` viewedURLs
diff --git a/src/lib/haskell/Model/URL.hs b/src/lib/haskell/Model/URL.hs
new file mode 100644
index 0000000..2114113
--- /dev/null
+++ b/src/lib/haskell/Model/URL.hs
@@ -0,0 +1,9 @@
+-- | URL representation used across the scrapers.
+module Model.URL
+  ( URL
+  ) where
+
+import Data.Text (Text)
+
+-- | A URL, kept as plain 'Text' (no validation is performed).
+type URL = Text
diff --git a/src/lib/haskell/Parser/LeboncoinParser.hs b/src/lib/haskell/Parser/LeboncoinParser.hs
new file mode 100644
index 0000000..77213cb
--- /dev/null
+++ b/src/lib/haskell/Parser/LeboncoinParser.hs
@@ -0,0 +1,34 @@
+-- | Extract ads from a Leboncoin listing page.
+module Parser.LeboncoinParser
+  ( parse
+  ) where
+
+import Data.Maybe (mapMaybe)
+import Data.Text (Text)
+import qualified Data.Text as T
+import Text.HTML.TagSoup
+
+import Model.Ad (Ad (Ad))
+import Parser.Utils
+
+-- | Parse the ads of a listing page; entries lacking a required field are
+-- skipped.
+parse :: Text -> [Ad]
+parse page = mapMaybe parseAd (partitions (~== T.unpack "<a>") listing)
+  where
+    listing =
+      getTagsBetween
+        "<li itemtype=http://schema.org/Offer>"
+        "<div class=information-immo_content>"
+        (parseTags page)
+
+-- | Build an ad from one partition starting at an anchor tag. Name, location
+-- and link are required; the price is optional.
+parseAd :: [Tag Text] -> Maybe Ad
+parseAd tags = do
+  name <- getTagTextAfter "<h2 class=item_title>" tags
+  location <- getTagAttribute "<meta itemprop=address>" "content" tags
+  url <- getTagAttribute "<a>" "href" tags
+  return (Ad name location price (T.concat ["https:", url]))
+  where
+    price = getTagTextAfter "<h3 class=item_price>" tags
diff --git a/src/lib/haskell/Parser/OuestFranceParser.hs b/src/lib/haskell/Parser/OuestFranceParser.hs
new file mode 100644
index 0000000..f46ed03
--- /dev/null
+++ b/src/lib/haskell/Parser/OuestFranceParser.hs
@@ -0,0 +1,35 @@
+-- | Extract ads from an Ouest-France Immo listing page.
+module Parser.OuestFranceParser
+  ( parse
+  ) where
+
+import Data.Maybe (mapMaybe)
+import Data.Text (Text)
+import qualified Data.Text as T
+import Text.HTML.TagSoup
+
+import Model.Ad (Ad (Ad))
+import Parser.Utils
+
+-- | Parse the ads of a listing page; entries lacking a required field are
+-- skipped.
+parse :: Text -> [Ad]
+parse page = mapMaybe parseAd (partitions (~== T.unpack "<a>") listing)
+  where
+    listing =
+      getTagsBetween
+        "<div id=listAnnonces>"
+        "<div id=interactions>"
+        (parseTags page)
+
+-- | Build an ad from one partition starting at an anchor tag. Name, location
+-- and link are required; the price is optional.
+parseAd :: [Tag Text] -> Maybe Ad
+parseAd tags = do
+  name <- getTagTextAfter "<span class=annTitre>" tags
+  location <- getTagTextAfter "<span class=annVille>" tags
+  url <- getTagAttribute "<a>" "href" tags
+  return (Ad name location price (T.concat [startUrl, url]))
+  where
+    price = getTagTextAfter "<span class=annPrix>" tags
+    startUrl = "https://www.ouestfrance-immo.com/"
diff --git a/src/lib/haskell/Parser/SeLogerParser.hs b/src/lib/haskell/Parser/SeLogerParser.hs
new file mode 100644
index 0000000..b073862
--- /dev/null
+++ b/src/lib/haskell/Parser/SeLogerParser.hs
@@ -0,0 +1,35 @@
+-- | Extract ads from a SeLoger listing page.
+module Parser.SeLogerParser
+  ( parse
+  ) where
+
+import Data.Maybe (mapMaybe)
+import Data.Text (Text)
+import qualified Data.Text as T
+import Text.HTML.TagSoup
+
+import Model.Ad (Ad (Ad))
+import Parser.Utils
+
+-- | Parse the ads of a listing page; entries lacking a required field are
+-- skipped.
+parse :: Text -> [Ad]
+parse page =
+  mapMaybe parseAd (partitions (~== T.unpack "<div class=c-pa-info>") listing)
+  where
+    listing =
+      getTagsBetween
+        "<section class=liste_resultat>"
+        "<div class=bottomAnchorWrapper>"
+        (parseTags page)
+
+-- | Build an ad from one partition starting at an ad info block. Name,
+-- location and link are required; the price is optional.
+parseAd :: [Tag Text] -> Maybe Ad
+parseAd tags = do
+  name <- getTagTextAfter "<a>" tags
+  location <- getTagTextAfter "<div class=c-pa-city>" tags
+  url <- getTagAttribute "<a>" "href" tags
+  return (Ad name location price url)
+  where
+    price = getTagTextAfter "<span class=c-pa-cprice>" tags
diff --git a/src/lib/haskell/Parser/Utils.hs b/src/lib/haskell/Parser/Utils.hs
new file mode 100644
index 0000000..4768327
--- /dev/null
+++ b/src/lib/haskell/Parser/Utils.hs
@@ -0,0 +1,52 @@
+-- | TagSoup helpers shared by the site parsers.
+--
+-- Selectors are textual TagSoup tag patterns; they are unpacked to 'String'
+-- before matching.
+module Parser.Utils
+  ( getTagsBefore
+  , getTagsAfter
+  , getTagsBetween
+  , getTagAttributes
+  , getTagAttribute
+  , getTagTextAfter
+  ) where
+
+import Data.Maybe (listToMaybe, mapMaybe)
+import Data.Text (Text)
+import qualified Data.Text as T
+import Text.HTML.TagSoup
+
+-- | The tags preceding the first tag that matches the selector.
+getTagsBefore :: Text -> [Tag Text] -> [Tag Text]
+getTagsBefore selector = takeWhile (~/= T.unpack selector)
+
+-- | The tags following the first match (the match itself is dropped);
+-- empty when nothing matches.
+getTagsAfter :: Text -> [Tag Text] -> [Tag Text]
+getTagsAfter selector = drop 1 . dropWhile (~/= T.unpack selector)
+
+-- | The tags after the first @begin@ match, up to but excluding the next
+-- @end@ match.
+getTagsBetween :: Text -> Text -> [Tag Text] -> [Tag Text]
+getTagsBetween begin end = getTagsBefore end . getTagsAfter begin
+
+-- | The values of @attribute@ on every opening tag matching the selector.
+getTagAttributes :: Text -> Text -> [Tag Text] -> [Text]
+getTagAttributes selector attribute =
+  mapMaybe (maybeTagAttribute attribute) . filter (~== T.unpack selector)
+
+-- | The first value found by 'getTagAttributes', if any.
+getTagAttribute :: Text -> Text -> [Tag Text] -> Maybe Text
+getTagAttribute selector attribute =
+  listToMaybe . getTagAttributes selector attribute
+
+-- | The stripped text of the tag right after the first match; 'Nothing' when
+-- nothing matches or that tag is not a text node.
+getTagTextAfter :: Text -> [Tag Text] -> Maybe Text
+getTagTextAfter selector tags =
+  T.strip <$> (listToMaybe (getTagsAfter selector tags) >>= maybeTagText)
+
+-- | The value of the named attribute, for an opening tag that carries it.
+maybeTagAttribute :: Text -> Tag Text -> Maybe Text
+maybeTagAttribute name (TagOpen _ attributes) = lookup name attributes
+maybeTagAttribute _ _ = Nothing
diff --git a/src/lib/haskell/Utils/HTTP.hs b/src/lib/haskell/Utils/HTTP.hs
new file mode 100644
index 0000000..87635ce
--- /dev/null
+++ b/src/lib/haskell/Utils/HTTP.hs
@@ -0,0 +1,34 @@
+-- | Minimal HTTP client that reports failures as values.
+module Utils.HTTP
+  ( get
+  ) where
+
+import Control.Exception (SomeException, evaluate, try)
+import Data.Bifunctor (first)
+import Data.ByteString (ByteString)
+import qualified Data.ByteString.Lazy as BS
+import Data.Text (Text)
+import qualified Data.Text as T
+import Network.HTTP.Conduit (simpleHttp)
+
+import Model.URL (URL)
+
+-- | Download a page and decode its body with the given decoder.
+--
+-- Any exception raised while downloading or decoding is returned as 'Left'
+-- holding the text produced by 'show'.
+get :: (ByteString -> Text) -> URL -> IO (Either Text Text)
+get decode url = first describe <$> try (unsafeGetPage decode url)
+  where
+    describe :: SomeException -> Text
+    describe = T.pack . show
+
+-- | Download and decode a page, throwing on failure.
+--
+-- The decoded text is forced with 'evaluate' before returning: decoders such
+-- as @decodeUtf8@ throw from pure code, and an unevaluated result would carry
+-- that exception past the 'try' in 'get' to whoever consumes the text.
+unsafeGetPage :: (ByteString -> Text) -> URL -> IO Text
+unsafeGetPage decode url = do
+  body <- simpleHttp (T.unpack url)
+  evaluate (decode (BS.toStrict body))
|