From 5977e1454d7738ddb086d37b20337e350e380790 Mon Sep 17 00:00:00 2001 From: Joris Guyonvarch Date: Sat, 11 Apr 2015 11:50:48 +0200 Subject: Fetch first page ads of a given leboncoin url, fetch also the description page of each item. --- src/Ad.hs | 33 +++++++++++++++++++++++++++++++++ src/Main.hs | 27 +++++++++++++++++++++++++++ src/Model/Ad.hs | 11 +++++++++++ src/Model/Detail.hs | 7 +++++++ src/Model/Resume.hs | 9 +++++++++ src/Page.hs | 15 +++++++++++++++ src/Parser/Detail.hs | 14 ++++++++++++++ src/Parser/Resume.hs | 27 +++++++++++++++++++++++++++ src/Parser/Utils.hs | 30 ++++++++++++++++++++++++++++++ src/Utils/StringFormat.hs | 6 ++++++ src/View/Ad.hs | 28 ++++++++++++++++++++++++++++ 11 files changed, 207 insertions(+) create mode 100644 src/Ad.hs create mode 100644 src/Main.hs create mode 100644 src/Model/Ad.hs create mode 100644 src/Model/Detail.hs create mode 100644 src/Model/Resume.hs create mode 100644 src/Page.hs create mode 100644 src/Parser/Detail.hs create mode 100644 src/Parser/Resume.hs create mode 100644 src/Parser/Utils.hs create mode 100644 src/Utils/StringFormat.hs create mode 100644 src/View/Ad.hs (limited to 'src') diff --git a/src/Ad.hs b/src/Ad.hs new file mode 100644 index 0000000..0fa131c --- /dev/null +++ b/src/Ad.hs @@ -0,0 +1,33 @@ +module Ad + ( getAds + ) where + +import Text.HTML.TagSoup (parseTags) + +import Page (getPage) + +import Model.Ad +import Model.Resume +import Model.Detail + +import Parser.Resume +import Parser.Detail + +getAds :: String -> IO (Either String [Ad]) +getAds url = do + eitherPage <- getPage url + case eitherPage of + Left error -> + return (Left error) + Right page -> + getAdsFromPage page + +getAdsFromPage :: String -> IO (Either String [Ad]) +getAdsFromPage page = do + xs <- sequence $ map getAd (parseResumes page) + return $ sequence xs + +getAd :: Resume -> IO (Either String Ad) +getAd resume = do + page <- getPage (url resume) + fmap (\page -> Ad { resume = resume, detail = parseDetail (parseTags page)}) <$> getPage (url resume) diff --git a/src/Main.hs b/src/Main.hs new file mode 100644 index 0000000..f352a43 --- /dev/null +++ b/src/Main.hs @@ -0,0 +1,27 @@ +module Main + ( main + ) where + +import Data.List (intersperse) + +import Ad (getAds) + +import Model.Ad + +import View.Ad (renderAd) + +import Page +import Parser.Detail + +url :: String +url = "http://www.leboncoin.fr/annonces/offres/corse/" + +main :: IO () +main = do + eitherAds <- getAds url + case eitherAds of + Left error -> + putStrLn error + Right ads -> do + writeFile "result.html" (concat . intersperse "\n\n" . map renderAd $ ads) + putStrLn "Done!" diff --git a/src/Model/Ad.hs b/src/Model/Ad.hs new file mode 100644 index 0000000..fe650a6 --- /dev/null +++ b/src/Model/Ad.hs @@ -0,0 +1,11 @@ +module Model.Ad + ( Ad(..) + ) where + +import Model.Resume +import Model.Detail + +data Ad = Ad + { resume :: Resume + , detail :: Detail + } deriving (Eq, Read, Show) diff --git a/src/Model/Detail.hs b/src/Model/Detail.hs new file mode 100644 index 0000000..f00a7eb --- /dev/null +++ b/src/Model/Detail.hs @@ -0,0 +1,7 @@ +module Model.Detail + ( Detail(..) + ) where + +data Detail = Detail + { description :: Maybe String + } deriving (Eq, Read, Show) diff --git a/src/Model/Resume.hs b/src/Model/Resume.hs new file mode 100644 index 0000000..1b73baf --- /dev/null +++ b/src/Model/Resume.hs @@ -0,0 +1,9 @@ +module Model.Resume + ( Resume(..) + ) where + +data Resume = Resume + { name :: String + , price :: Maybe String + , url :: String + } deriving (Eq, Read, Show) diff --git a/src/Page.hs b/src/Page.hs new file mode 100644 index 0000000..b70db70 --- /dev/null +++ b/src/Page.hs @@ -0,0 +1,15 @@ +module Page + ( getPage + ) where + +import Control.Exception (SomeException, try) + +import Network.HTTP (simpleHTTP, getRequest, getResponseBody) + +getPage :: String -> IO (Either String String) +getPage url = + mapLeft show <$> (try (simpleHTTP (getRequest url) >>= getResponseBody) :: IO (Either SomeException String)) + +mapLeft :: (a -> c) -> Either a b -> Either c b +mapLeft f (Left l) = Left (f l) +mapLeft _ (Right r) = (Right r) diff --git a/src/Parser/Detail.hs b/src/Parser/Detail.hs new file mode 100644 index 0000000..031d740 --- /dev/null +++ b/src/Parser/Detail.hs @@ -0,0 +1,14 @@ +module Parser.Detail + ( parseDetail + ) where + +import Text.HTML.TagSoup + +import Model.Detail + +import Parser.Utils + +parseDetail :: [Tag String] -> Detail +parseDetail tags = + let description = getTagText "
" tags + in Detail { description = description } diff --git a/src/Parser/Resume.hs b/src/Parser/Resume.hs new file mode 100644 index 0000000..bd73912 --- /dev/null +++ b/src/Parser/Resume.hs @@ -0,0 +1,27 @@ +module Parser.Resume + ( parseResumes + ) where + +import Data.Maybe (catMaybes) + +import Text.HTML.TagSoup + +import Model.Resume + +import Parser.Utils + +parseResumes :: String -> [Resume] +parseResumes page = + case sections (~== "
") (parseTags page) of + [] -> + [] + sectionTags : _ -> + let lbcTags = takeWhile (~/= "
") sectionTags + in catMaybes . fmap parseResume $ partitions (~== "") lbcTags + +parseResume :: [Tag String] -> Maybe Resume +parseResume item = do + name <- getTagText "

" item + let price = getTagText "
" item + url <- getTagAttribute "" "href" item + return Resume { name = name, price = price, url = url } diff --git a/src/Parser/Utils.hs b/src/Parser/Utils.hs new file mode 100644 index 0000000..4864e00 --- /dev/null +++ b/src/Parser/Utils.hs @@ -0,0 +1,30 @@ +module Parser.Utils + ( getTagAttribute + , getTagText + ) where + +import Data.List (find, findIndex) +import Data.Maybe (listToMaybe) + +import Text.HTML.TagSoup + +getTagAttribute :: String -> String -> [Tag String] -> Maybe String +getTagAttribute selector attribute item = + find (~== selector) item >>= maybeTagAttribute attribute + +getTagText :: String -> [Tag String] -> Maybe String +getTagText selector item = + case findIndex (~== selector) item of + Just index -> fmap trim $ safeGetAt (index + 1) item >>= maybeTagText + Nothing -> Nothing + +maybeTagAttribute :: String -> Tag String -> Maybe String +maybeTagAttribute name (TagOpen _ xs) = + fmap snd . find (\(x, _) -> x == name) $ xs +maybeTagAttribute attribute _ = Nothing + +trim :: String -> String +trim = unwords . words + +safeGetAt :: Int -> [a] -> Maybe a +safeGetAt index = listToMaybe . drop index diff --git a/src/Utils/StringFormat.hs b/src/Utils/StringFormat.hs new file mode 100644 index 0000000..757f0e6 --- /dev/null +++ b/src/Utils/StringFormat.hs @@ -0,0 +1,6 @@ +module Utils.StringFormat + ( formatParagraph + ) where + +formatParagraph : String -> String +formatParagraph = id diff --git a/src/View/Ad.hs b/src/View/Ad.hs new file mode 100644 index 0000000..445ef3a --- /dev/null +++ b/src/View/Ad.hs @@ -0,0 +1,28 @@ +module View.Ad + ( renderAd + ) where + +import Data.List (intersperse) +import Data.Maybe (fromMaybe) + +import Model.Ad (Ad(..)) +import qualified Model.Ad as Ad +import Model.Resume (Resume(..)) +import qualified Model.Resume as Resume +import Model.Detail (Detail(..)) +import qualified Model.Detail as Detail + +renderAd :: Ad -> String +renderAd ad = + (renderResume (Ad.resume ad)) ++ "\n\n" ++ (renderDetail (Ad.detail ad)) ++ "\n" + +renderResume :: Resume -> String +renderResume resume = + let formatPrice price = " - " ++ price + price = fromMaybe "" . fmap formatPrice . Resume.price $ resume + titleLine = (Resume.name resume) ++ price + in concat . intersperse "\n" $ [titleLine, Resume.url resume] + +renderDetail :: Detail -> String +renderDetail detail = + fromMaybe "−" (Detail.description detail) -- cgit v1.2.3