diff options
author | Joris Guyonvarch | 2015-04-11 11:50:48 +0200 |
---|---|---|
committer | Joris Guyonvarch | 2015-04-11 12:03:22 +0200 |
commit | 5977e1454d7738ddb086d37b20337e350e380790 (patch) | |
tree | e03261144e3d534434242c1dd037c2a4e4db5a9f |
Fetch first page ads of a given leboncoin url, fetch also the description page of each item.
-rw-r--r-- | .gitignore | 4 | ||||
-rwxr-xr-x | gen | 2 | ||||
-rw-r--r-- | leboncoin-listener.cabal | 11 | ||||
-rw-r--r-- | src/Ad.hs | 33 | ||||
-rw-r--r-- | src/Main.hs | 27 | ||||
-rw-r--r-- | src/Model/Ad.hs | 11 | ||||
-rw-r--r-- | src/Model/Detail.hs | 7 | ||||
-rw-r--r-- | src/Model/Resume.hs | 9 | ||||
-rw-r--r-- | src/Page.hs | 15 | ||||
-rw-r--r-- | src/Parser/Detail.hs | 14 | ||||
-rw-r--r-- | src/Parser/Resume.hs | 27 | ||||
-rw-r--r-- | src/Parser/Utils.hs | 30 | ||||
-rw-r--r-- | src/Utils/StringFormat.hs | 6 | ||||
-rw-r--r-- | src/View/Ad.hs | 28 |
14 files changed, 224 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..54228d1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.cabal-sandbox +cabal.sandbox.config +dist +result.html @@ -0,0 +1,2 @@ +#!/bin/bash +cabal build && ./dist/build/leboncoin/leboncoin diff --git a/leboncoin-listener.cabal b/leboncoin-listener.cabal new file mode 100644 index 0000000..b75de6e --- /dev/null +++ b/leboncoin-listener.cabal @@ -0,0 +1,11 @@ +name: leboncoin-listener +version: 0.0.1 +build-type: Simple +cabal-version: >= 1.8 + +executable leboncoin-listener + main-is: Main.hs + hs-source-dirs: src + build-depends: base + , HTTP == 4000.2.19 + , tagsoup == 0.13.3 diff --git a/src/Ad.hs b/src/Ad.hs new file mode 100644 index 0000000..0fa131c --- /dev/null +++ b/src/Ad.hs @@ -0,0 +1,33 @@ +module Ad + ( getAds + ) where + +import Text.HTML.TagSoup (parseTags) + +import Page (getPage) + +import Model.Ad +import Model.Resume +import Model.Detail + +import Parser.Resume +import Parser.Detail + +getAds :: String -> IO (Either String [Ad]) +getAds url = do + eitherPage <- getPage url + case eitherPage of + Left error -> + return (Left error) + Right page -> + getAdsFromPage page + +getAdsFromPage :: String -> IO (Either String [Ad]) +getAdsFromPage page = do + xs <- sequence $ map getAd (parseResumes page) + return $ sequence xs + +getAd :: Resume -> IO (Either String Ad) +getAd resume = do + page <- getPage (url resume) + fmap (\page -> Ad { resume = resume, detail = parseDetail (parseTags page)}) <$> getPage (url resume) diff --git a/src/Main.hs b/src/Main.hs new file mode 100644 index 0000000..f352a43 --- /dev/null +++ b/src/Main.hs @@ -0,0 +1,27 @@ +module Main + ( main + ) where + +import Data.List (intersperse) + +import Ad (getAds) + +import Model.Ad + +import View.Ad (renderAd) + +import Page +import Parser.Detail + +url :: String +url = "http://www.leboncoin.fr/annonces/offres/corse/" + +main :: IO () +main = do + eitherAds <- getAds url + case eitherAds of + Left error -> + putStrLn error + Right ads -> do + writeFile "result.html" (concat . intersperse "\n\n" . map renderAd $ ads) + putStrLn "Done!" diff --git a/src/Model/Ad.hs b/src/Model/Ad.hs new file mode 100644 index 0000000..fe650a6 --- /dev/null +++ b/src/Model/Ad.hs @@ -0,0 +1,11 @@ +module Model.Ad + ( Ad(..) + ) where + +import Model.Resume +import Model.Detail + +data Ad = Ad + { resume :: Resume + , detail :: Detail + } deriving (Eq, Read, Show) diff --git a/src/Model/Detail.hs b/src/Model/Detail.hs new file mode 100644 index 0000000..f00a7eb --- /dev/null +++ b/src/Model/Detail.hs @@ -0,0 +1,7 @@ +module Model.Detail + ( Detail(..) + ) where + +data Detail = Detail + { description :: Maybe String + } deriving (Eq, Read, Show) diff --git a/src/Model/Resume.hs b/src/Model/Resume.hs new file mode 100644 index 0000000..1b73baf --- /dev/null +++ b/src/Model/Resume.hs @@ -0,0 +1,9 @@ +module Model.Resume + ( Resume(..) + ) where + +data Resume = Resume + { name :: String + , price :: Maybe String + , url :: String + } deriving (Eq, Read, Show) diff --git a/src/Page.hs b/src/Page.hs new file mode 100644 index 0000000..b70db70 --- /dev/null +++ b/src/Page.hs @@ -0,0 +1,15 @@ +module Page + ( getPage + ) where + +import Control.Exception (SomeException, try) + +import Network.HTTP (simpleHTTP, getRequest, getResponseBody) + +getPage :: String -> IO (Either String String) +getPage url = + mapLeft show <$> (try (simpleHTTP (getRequest url) >>= getResponseBody) :: IO (Either SomeException String)) + +mapLeft :: (a -> c) -> Either a b -> Either c b +mapLeft f (Left l) = Left (f l) +mapLeft _ (Right r) = (Right r) diff --git a/src/Parser/Detail.hs b/src/Parser/Detail.hs new file mode 100644 index 0000000..031d740 --- /dev/null +++ b/src/Parser/Detail.hs @@ -0,0 +1,14 @@ +module Parser.Detail + ( parseDetail + ) where + +import Text.HTML.TagSoup + +import Model.Detail + +import Parser.Utils + +parseDetail :: [Tag String] -> Detail +parseDetail tags = + let description = getTagText "<div class=content>" tags + in Detail { description = description } diff --git a/src/Parser/Resume.hs b/src/Parser/Resume.hs new file mode 100644 index 0000000..bd73912 --- /dev/null +++ b/src/Parser/Resume.hs @@ -0,0 +1,27 @@ +module Parser.Resume + ( parseResumes + ) where + +import Data.Maybe (catMaybes) + +import Text.HTML.TagSoup + +import Model.Resume + +import Parser.Utils + +parseResumes :: String -> [Resume] +parseResumes page = + case sections (~== "<div class=list-lbc>") (parseTags page) of + [] -> + [] + sectionTags : _ -> + let lbcTags = takeWhile (~/= "<div id=alertesCartouche>") sectionTags + in catMaybes . fmap parseResume $ partitions (~== "<a>") lbcTags + +parseResume :: [Tag String] -> Maybe Resume +parseResume item = do + name <- getTagText "<h2 class=title>" item + let price = getTagText "<div class=price>" item + url <- getTagAttribute "<a>" "href" item + return Resume { name = name, price = price, url = url } diff --git a/src/Parser/Utils.hs b/src/Parser/Utils.hs new file mode 100644 index 0000000..4864e00 --- /dev/null +++ b/src/Parser/Utils.hs @@ -0,0 +1,30 @@ +module Parser.Utils + ( getTagAttribute + , getTagText + ) where + +import Data.List (find, findIndex) +import Data.Maybe (listToMaybe) + +import Text.HTML.TagSoup + +getTagAttribute :: String -> String -> [Tag String] -> Maybe String +getTagAttribute selector attribute item = + find (~== selector) item >>= maybeTagAttribute attribute + +getTagText :: String -> [Tag String] -> Maybe String +getTagText selector item = + case findIndex (~== selector) item of + Just index -> fmap trim $ safeGetAt (index + 1) item >>= maybeTagText + Nothing -> Nothing + +maybeTagAttribute :: String -> Tag String -> Maybe String +maybeTagAttribute name (TagOpen _ xs) = + fmap snd . find (\(x, _) -> x == name) $ xs +maybeTagAttribute attribute _ = Nothing + +trim :: String -> String +trim = unwords . words + +safeGetAt :: Int -> [a] -> Maybe a +safeGetAt index = listToMaybe . drop index diff --git a/src/Utils/StringFormat.hs b/src/Utils/StringFormat.hs new file mode 100644 index 0000000..757f0e6 --- /dev/null +++ b/src/Utils/StringFormat.hs @@ -0,0 +1,6 @@ +module Utils.StringFormat + ( formatParagraph + ) where + +formatParagraph : String -> String +formatParagraph = id diff --git a/src/View/Ad.hs b/src/View/Ad.hs new file mode 100644 index 0000000..445ef3a --- /dev/null +++ b/src/View/Ad.hs @@ -0,0 +1,28 @@ +module View.Ad + ( renderAd + ) where + +import Data.List (intersperse) +import Data.Maybe (fromMaybe) + +import Model.Ad (Ad(..)) +import qualified Model.Ad as Ad +import Model.Resume (Resume(..)) +import qualified Model.Resume as Resume +import Model.Detail (Detail(..)) +import qualified Model.Detail as Detail + +renderAd :: Ad -> String +renderAd ad = + (renderResume (Ad.resume ad)) ++ "\n\n" ++ (renderDetail (Ad.detail ad)) ++ "\n" + +renderResume :: Resume -> String +renderResume resume = + let formatPrice price = " - " ++ price + price = fromMaybe "" . fmap formatPrice . Resume.price $ resume + titleLine = (Resume.name resume) ++ price + in concat . intersperse "\n" $ [titleLine, Resume.url resume] + +renderDetail :: Detail -> String +renderDetail detail = + fromMaybe "−" (Detail.description detail) |