From 8e03a571033a5d02d2287a2b1d1edd8b57aa2462 Mon Sep 17 00:00:00 2001 From: Joris Date: Sun, 10 Mar 2019 19:15:45 +0100 Subject: Scrap result page and make a CSV of it --- src/CSV.hs | 17 +++++++++++++ src/HTTP.hs | 11 +++++++++ src/Hiking.hs | 42 ++++++++++++++++++++++++++++++++ src/Main.hs | 34 ++++++++++++++++++++++++++ src/README.md | 5 ++++ src/Scrapper.hs | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 184 insertions(+) create mode 100644 src/CSV.hs create mode 100644 src/HTTP.hs create mode 100644 src/Hiking.hs create mode 100644 src/Main.hs create mode 100644 src/README.md create mode 100644 src/Scrapper.hs (limited to 'src') diff --git a/src/CSV.hs b/src/CSV.hs new file mode 100644 index 0000000..9ad2974 --- /dev/null +++ b/src/CSV.hs @@ -0,0 +1,17 @@ +module CSV (lines, line) where + +import Data.Text (Text) +import qualified Data.Text as T +import Prelude hiding (lines) + +lines :: [[Text]] -> Text +lines = T.unlines . map line + +line :: [Text] -> Text +line = T.intercalate "," . map value + +value :: Text -> Text +value text = + if (T.isInfixOf "," text) + then T.concat [ "\"", text, "\"" ] + else text diff --git a/src/HTTP.hs b/src/HTTP.hs new file mode 100644 index 0000000..5fb3374 --- /dev/null +++ b/src/HTTP.hs @@ -0,0 +1,11 @@ +module HTTP (get) where + +import qualified Data.ByteString.Lazy as BS +import Data.Text (Text) +import qualified Data.Text as T +import Data.Text.Encoding as T +import qualified Data.Text.IO as T +import qualified Network.HTTP.Conduit as HTTP + +get :: Text -> IO Text +get url = (T.decodeUtf8 . BS.toStrict) <$> HTTP.simpleHttp (T.unpack url) diff --git a/src/Hiking.hs b/src/Hiking.hs new file mode 100644 index 0000000..2d04b4f --- /dev/null +++ b/src/Hiking.hs @@ -0,0 +1,42 @@ +module Hiking (Hiking(..), header, values) where + +import Data.Text +import qualified Data.Text as T + +data Hiking = Hiking + { name :: Text + , description :: Text + , lat :: Text + , lng :: Text + , url :: Text + , duration :: Text + , distance :: Text + , elevation :: Text + , difficulty :: Text + } deriving (Eq, Show) + +header :: [Text] +header = + [ "name" + , "lat" + , "lng" + , "link" + , "durée" + , "distance" + , "dénivelé" + , "color" + , "infos" + ] + +values :: Hiking -> [Text] +values h = + [ name h + , lat h + , lng h + , url h + , duration h + , distance h + , elevation h + , difficulty h + , description h + ] diff --git a/src/Main.hs b/src/Main.hs new file mode 100644 index 0000000..0d527b5 --- /dev/null +++ b/src/Main.hs @@ -0,0 +1,34 @@ +module Main (main) where + +import Data.Text (Text) +import qualified Data.Text as T +import qualified Data.Text.IO as T +import qualified System.Environment as Env +import qualified System.Exit as Exit + +import qualified CSV +import Hiking (Hiking (Hiking)) +import qualified Hiking +import qualified HTTP +import qualified Scrapper + +main :: IO () +main = do + url <- getUrl + hikings <- getHikings url + T.putStrLn . CSV.lines . (:) Hiking.header . map Hiking.values $ hikings + +getUrl :: IO Text +getUrl = do + args <- Env.getArgs + case args of + (url:_) -> return $ T.pack url + _ -> Exit.die "No url providen" + +getHikings :: Text -> IO [Hiking] +getHikings url = do + urls <- Scrapper.hikingsUrls <$> HTTP.get url + mapM getHiking urls + +getHiking :: Text -> IO Hiking +getHiking = fmap Scrapper.hiking . HTTP.get diff --git a/src/README.md b/src/README.md new file mode 100644 index 0000000..03c94b4 --- /dev/null +++ b/src/README.md @@ -0,0 +1,5 @@ +# Visorando scrapper + +Produce a CSV of hikings from a [search result +page](https://www.visorando.com/randonnee-pyrenees.html`) on +[visorando](https://www.visorando.com/). diff --git a/src/Scrapper.hs b/src/Scrapper.hs new file mode 100644 index 0000000..456a455 --- /dev/null +++ b/src/Scrapper.hs @@ -0,0 +1,75 @@ +module Scrapper (hikingsUrls, hiking, Hiking(..)) where + +import qualified Data.List as List +import qualified Data.Maybe as Maybe +import Data.Text (Text) +import qualified Data.Text as T +import Text.HTML.TagSoup (Tag (TagOpen, TagText), (~/=), (~==)) +import qualified Text.HTML.TagSoup as S + +import Hiking (Hiking (..)) + +hikingsUrls :: Text -> [Text] +hikingsUrls = + Maybe.catMaybes + . map sectionUrl + . S.partitions (~== ("
" :: String)) + . S.parseTags + where + sectionUrl = + fmap (T.strip . S.fromAttrib "href") + . Maybe.listToMaybe + . dropWhile (~/= ("" :: String)) + +hiking :: Text -> Hiking +hiking page = + Hiking name description lat lng url duration distance elevation difficulty + where + tags = S.parseTags page + + name = textInside "

" "" "

" . dropWhile (~/= ("

" :: String)) $ tags + + (lat, lng) = latAndLng (info "Départ :") + + url = S.fromAttrib "href" . head . dropWhile (~/= ("" :: String)) $ tags + + duration = info "Durée moyenne:" + + distance = info "Distance :" + + elevation = T.concat [ "+", info "Dénivelé positif :", " -", info "Dénivelé négatif :" ] + + difficulty = + case info "Difficulté :" of + "Facile" -> "green" + "Moyenne" -> "blue" + "Difficile" -> "red" + _ -> "gray" + + info title = nextTagText . drop 1 . dropWhile (not . tagTextContains title) $ tags + + tagTextContains title (TagText t) = title == T.strip t + tagTextContains _ _ = False + + +latAndLng :: Text -> (Text, Text) +latAndLng t = + let [ns, lat, _, oe, lng] = T.words t + a = T.concat [ if ns == "S" then "-" else "", T.dropEnd 1 lat ] + b = T.concat [ if oe == "O" then "-" else "", T.dropEnd 1 lng ] + in (a, b) + +textInside :: Text -> Text -> [Tag Text] -> Text +textInside from to = + T.unwords + . map (T.strip) + . T.lines + . T.strip + . S.innerText + . takeWhile (~/= T.unpack to) + . dropWhile (~/= T.unpack from) + +nextTagText :: [Tag Text] -> Text +nextTagText = T.strip . S.innerText . take 1 . drop 1 -- cgit v1.2.3