From 8e03a571033a5d02d2287a2b1d1edd8b57aa2462 Mon Sep 17 00:00:00 2001 From: Joris Date: Sun, 10 Mar 2019 19:15:45 +0100 Subject: Scrap result page and make a CSV of it --- src/CSV.hs | 17 +++++++++++++ src/HTTP.hs | 11 +++++++++ src/Hiking.hs | 42 ++++++++++++++++++++++++++++++++ src/Main.hs | 34 ++++++++++++++++++++++++++ src/README.md | 5 ++++ src/Scrapper.hs | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 184 insertions(+) create mode 100644 src/CSV.hs create mode 100644 src/HTTP.hs create mode 100644 src/Hiking.hs create mode 100644 src/Main.hs create mode 100644 src/README.md create mode 100644 src/Scrapper.hs (limited to 'src') diff --git a/src/CSV.hs b/src/CSV.hs new file mode 100644 index 0000000..9ad2974 --- /dev/null +++ b/src/CSV.hs @@ -0,0 +1,17 @@ +module CSV (lines, line) where + +import Data.Text (Text) +import qualified Data.Text as T +import Prelude hiding (lines) + +lines :: [[Text]] -> Text +lines = T.unlines . map line + +line :: [Text] -> Text +line = T.intercalate "," . map value + +value :: Text -> Text +value text = + if (T.isInfixOf "," text) + then T.concat [ "\"", text, "\"" ] + else text diff --git a/src/HTTP.hs b/src/HTTP.hs new file mode 100644 index 0000000..5fb3374 --- /dev/null +++ b/src/HTTP.hs @@ -0,0 +1,11 @@ +module HTTP (get) where + +import qualified Data.ByteString.Lazy as BS +import Data.Text (Text) +import qualified Data.Text as T +import Data.Text.Encoding as T +import qualified Data.Text.IO as T +import qualified Network.HTTP.Conduit as HTTP + +get :: Text -> IO Text +get url = (T.decodeUtf8 . BS.toStrict) <$> HTTP.simpleHttp (T.unpack url) diff --git a/src/Hiking.hs b/src/Hiking.hs new file mode 100644 index 0000000..2d04b4f --- /dev/null +++ b/src/Hiking.hs @@ -0,0 +1,42 @@ +module Hiking (Hiking(..), header, values) where + +import Data.Text +import qualified Data.Text as T + +data Hiking = Hiking + { name :: Text + , description :: Text + , lat :: Text + , lng :: Text + , url :: Text + , duration :: Text + , distance :: Text + , elevation :: Text + , difficulty :: Text + } deriving (Eq, Show) + +header :: [Text] +header = + [ "name" + , "lat" + , "lng" + , "link" + , "durée" + , "distance" + , "dénivelé" + , "color" + , "infos" + ] + +values :: Hiking -> [Text] +values h = + [ name h + , lat h + , lng h + , url h + , duration h + , distance h + , elevation h + , difficulty h + , description h + ] diff --git a/src/Main.hs b/src/Main.hs new file mode 100644 index 0000000..0d527b5 --- /dev/null +++ b/src/Main.hs @@ -0,0 +1,34 @@ +module Main (main) where + +import Data.Text (Text) +import qualified Data.Text as T +import qualified Data.Text.IO as T +import qualified System.Environment as Env +import qualified System.Exit as Exit + +import qualified CSV +import Hiking (Hiking (Hiking)) +import qualified Hiking +import qualified HTTP +import qualified Scrapper + +main :: IO () +main = do + url <- getUrl + hikings <- getHikings url + T.putStrLn . CSV.lines . (:) Hiking.header . map Hiking.values $ hikings + +getUrl :: IO Text +getUrl = do + args <- Env.getArgs + case args of + (url:_) -> return $ T.pack url + _ -> Exit.die "No url providen" + +getHikings :: Text -> IO [Hiking] +getHikings url = do + urls <- Scrapper.hikingsUrls <$> HTTP.get url + mapM getHiking urls + +getHiking :: Text -> IO Hiking +getHiking = fmap Scrapper.hiking . HTTP.get diff --git a/src/README.md b/src/README.md new file mode 100644 index 0000000..03c94b4 --- /dev/null +++ b/src/README.md @@ -0,0 +1,5 @@ +# Visorando scrapper + +Produce a CSV of hikings from a [search result +page](https://www.visorando.com/randonnee-pyrenees.html`) on +[visorando](https://www.visorando.com/). diff --git a/src/Scrapper.hs b/src/Scrapper.hs new file mode 100644 index 0000000..456a455 --- /dev/null +++ b/src/Scrapper.hs @@ -0,0 +1,75 @@ +module Scrapper (hikingsUrls, hiking, Hiking(..)) where + +import qualified Data.List as List +import qualified Data.Maybe as Maybe +import Data.Text (Text) +import qualified Data.Text as T +import Text.HTML.TagSoup (Tag (TagOpen, TagText), (~/=), (~==)) +import qualified Text.HTML.TagSoup as S + +import Hiking (Hiking (..)) + +hikingsUrls :: Text -> [Text] +hikingsUrls = + Maybe.catMaybes + . map sectionUrl + . S.partitions (~== ("