| author | Joris | 2019-03-10 19:15:45 +0100 |
|---|---|---|
| committer | Joris | 2019-03-10 19:15:45 +0100 |
| commit | 8e03a571033a5d02d2287a2b1d1edd8b57aa2462 | |
| tree | 03d6965bb65322dbac03edf2d6a30ee856c197f6 | |
| parent | c41f16ed474376ad8a61a75d8b7f9ef543f359b4 | |
Scrape the result page and make a CSV of it
| mode | file | lines added |
|---|---|---|
| -rw-r--r-- | .gitignore | 2 |
| -rw-r--r-- | .tmuxinator.yml | 10 |
| -rw-r--r-- | LICENSE | 30 |
| -rw-r--r-- | default.nix | 13 |
| -rwxr-xr-x | dev | 18 |
| -rw-r--r-- | src/CSV.hs | 17 |
| -rw-r--r-- | src/HTTP.hs | 11 |
| -rw-r--r-- | src/Hiking.hs | 42 |
| -rw-r--r-- | src/Main.hs | 34 |
| -rw-r--r-- | src/README.md | 5 |
| -rw-r--r-- | src/Scrapper.hs | 75 |
| -rw-r--r-- | visorando-scraper.cabal | 25 |

12 files changed, 282 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..697a2c6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+dist-newstyle
+.ghc.environment.*
diff --git a/.tmuxinator.yml b/.tmuxinator.yml
new file mode 100644
index 0000000..e81d3e8
--- /dev/null
+++ b/.tmuxinator.yml
@@ -0,0 +1,10 @@
+name: visorando-scraper
+
+windows:
+  - app:
+      panes:
+        - nodemon -e hs --exec 'clear && cabal new-run --verbose=0'
+        # - ghcid --command "cabal new-repl" -T ":main"
+        # - cabal new-run
+        # - ghcid --command "cabal new-repl" -T ":main"
+        # - cabal new-build
diff --git a/LICENSE b/LICENSE
new file mode 100644
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,30 @@
+Copyright (c) 2019, Joris
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the following
+      disclaimer in the documentation and/or other materials provided
+      with the distribution.
+
+    * Neither the name of Joris nor the names of other
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/default.nix b/default.nix
new file mode 100644
index 0000000..2f1a1f8
--- /dev/null
+++ b/default.nix
@@ -0,0 +1,13 @@
+with import <nixpkgs> {}; {
+  env = stdenv.mkDerivation {
+    name = "env";
+    buildInputs = [
+      cabal-install
+      tmux
+      tmuxinator
+      nodePackages.nodemon
+      # haskellPackages.ghcid
+      zlib
+    ];
+  };
+}
diff --git a/dev b/dev
new file mode 100755
--- /dev/null
+++ b/dev
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+cd "$(dirname $0)"
+CMD="$1"
+
+if [ "$CMD" = "start" ]; then
+
+  nix-shell --command "tmuxinator local"
+
+elif [ "$CMD" = "stop" ]; then
+
+  nix-shell --command "tmux kill-session -t visorando-scraper"
+
+else
+
+  echo "Usage: $0 start|stop"
+  exit 1
+
+fi
diff --git a/src/CSV.hs b/src/CSV.hs
new file mode 100644
index 0000000..9ad2974
--- /dev/null
+++ b/src/CSV.hs
@@ -0,0 +1,17 @@
+module CSV (lines, line) where
+
+import Data.Text (Text)
+import qualified Data.Text as T
+import Prelude hiding (lines)
+
+lines :: [[Text]] -> Text
+lines = T.unlines . map line
+
+line :: [Text] -> Text
+line = T.intercalate "," . map value
+
+value :: Text -> Text
+value text =
+  if (T.isInfixOf "," text)
+  then T.concat [ "\"", text, "\"" ]
+  else text
diff --git a/src/HTTP.hs b/src/HTTP.hs
new file mode 100644
index 0000000..5fb3374
--- /dev/null
+++ b/src/HTTP.hs
@@ -0,0 +1,11 @@
+module HTTP (get) where
+
+import qualified Data.ByteString.Lazy as BS
+import Data.Text (Text)
+import qualified Data.Text as T
+import Data.Text.Encoding as T
+import qualified Data.Text.IO as T
+import qualified Network.HTTP.Conduit as HTTP
+
+get :: Text -> IO Text
+get url = (T.decodeUtf8 . BS.toStrict) <$> HTTP.simpleHttp (T.unpack url)
diff --git a/src/Hiking.hs b/src/Hiking.hs
new file mode 100644
index 0000000..2d04b4f
--- /dev/null
+++ b/src/Hiking.hs
@@ -0,0 +1,42 @@
+module Hiking (Hiking(..), header, values) where
+
+import Data.Text
+import qualified Data.Text as T
+
+data Hiking = Hiking
+  { name :: Text
+  , description :: Text
+  , lat :: Text
+  , lng :: Text
+  , url :: Text
+  , duration :: Text
+  , distance :: Text
+  , elevation :: Text
+  , difficulty :: Text
+  } deriving (Eq, Show)
+
+header :: [Text]
+header =
+  [ "name"
+  , "lat"
+  , "lng"
+  , "link"
+  , "durée"
+  , "distance"
+  , "dénivelé"
+  , "color"
+  , "infos"
+  ]
+
+values :: Hiking -> [Text]
+values h =
+  [ name h
+  , lat h
+  , lng h
+  , url h
+  , duration h
+  , distance h
+  , elevation h
+  , difficulty h
+  , description h
+  ]
diff --git a/src/Main.hs b/src/Main.hs
new file mode 100644
index 0000000..0d527b5
--- /dev/null
+++ b/src/Main.hs
@@ -0,0 +1,34 @@
+module Main (main) where
+
+import Data.Text (Text)
+import qualified Data.Text as T
+import qualified Data.Text.IO as T
+import qualified System.Environment as Env
+import qualified System.Exit as Exit
+
+import qualified CSV
+import Hiking (Hiking (Hiking))
+import qualified Hiking
+import qualified HTTP
+import qualified Scrapper
+
+main :: IO ()
+main = do
+  url <- getUrl
+  hikings <- getHikings url
+  T.putStrLn . CSV.lines . (:) Hiking.header . map Hiking.values $ hikings
+
+getUrl :: IO Text
+getUrl = do
+  args <- Env.getArgs
+  case args of
+    (url:_) -> return $ T.pack url
+    _ -> Exit.die "No url provided"
+
+getHikings :: Text -> IO [Hiking]
+getHikings url = do
+  urls <- Scrapper.hikingsUrls <$> HTTP.get url
+  mapM getHiking urls
+
+getHiking :: Text -> IO Hiking
+getHiking = fmap Scrapper.hiking . HTTP.get
diff --git a/src/README.md b/src/README.md
new file mode 100644
index 0000000..03c94b4
--- /dev/null
+++ b/src/README.md
@@ -0,0 +1,5 @@
+# Visorando scraper
+
+Produce a CSV of hikes from a [search result
+page](https://www.visorando.com/randonnee-pyrenees.html) on
+[visorando](https://www.visorando.com/).
diff --git a/src/Scrapper.hs b/src/Scrapper.hs
new file mode 100644
index 0000000..456a455
--- /dev/null
+++ b/src/Scrapper.hs
@@ -0,0 +1,75 @@
+module Scrapper (hikingsUrls, hiking, Hiking(..)) where
+
+import qualified Data.List as List
+import qualified Data.Maybe as Maybe
+import Data.Text (Text)
+import qualified Data.Text as T
+import Text.HTML.TagSoup (Tag (TagOpen, TagText), (~/=), (~==))
+import qualified Text.HTML.TagSoup as S
+
+import Hiking (Hiking (..))
+
+hikingsUrls :: Text -> [Text]
+hikingsUrls =
+  Maybe.catMaybes
+    . map sectionUrl
+    . S.partitions (~== ("<div class=rando-title-sansDetail>" :: String))
+    . S.parseTags
+  where
+    sectionUrl =
+      fmap (T.strip . S.fromAttrib "href")
+        . Maybe.listToMaybe
+        . dropWhile (~/= ("<a>" :: String))
+
+hiking :: Text -> Hiking
+hiking page =
+  Hiking name description lat lng url duration distance elevation difficulty
+  where
+    tags = S.parseTags page
+
+    name = textInside "<h1 itemprop=name>" "</h1>" tags
+
+    description = textInside "<p>" "</p>" . dropWhile (~/= ("<h1 itemprop=name>" :: String)) $ tags
+
+    (lat, lng) = latAndLng (info "Départ :")
+
+    url = S.fromAttrib "href" . head . dropWhile (~/= ("<link>" :: String)) $ tags
+
+    duration = info "Durée moyenne:"
+
+    distance = info "Distance :"
+
+    elevation = T.concat [ "+", info "Dénivelé positif :", " -", info "Dénivelé négatif :" ]
+
+    difficulty =
+      case info "Difficulté :" of
+        "Facile" -> "green"
+        "Moyenne" -> "blue"
+        "Difficile" -> "red"
+        _ -> "gray"
+
+    info title = nextTagText . drop 1 . dropWhile (not . tagTextContains title) $ tags
+
+    tagTextContains title (TagText t) = title == T.strip t
+    tagTextContains _ _ = False
+
+
+latAndLng :: Text -> (Text, Text)
+latAndLng t =
+  let [ns, lat, _, oe, lng] = T.words t
+      a = T.concat [ if ns == "S" then "-" else "", T.dropEnd 1 lat ]
+      b = T.concat [ if oe == "O" then "-" else "", T.dropEnd 1 lng ]
+  in (a, b)
+
+textInside :: Text -> Text -> [Tag Text] -> Text
+textInside from to =
+  T.unwords
+    . map (T.strip)
+    . T.lines
+    . T.strip
+    . S.innerText
+    . takeWhile (~/= T.unpack to)
+    . dropWhile (~/= T.unpack from)
+
+nextTagText :: [Tag Text] -> Text
+nextTagText = T.strip . S.innerText . take 1 . drop 1
diff --git a/visorando-scraper.cabal b/visorando-scraper.cabal
new file mode 100644
index 0000000..7dd24b6
--- /dev/null
+++ b/visorando-scraper.cabal
@@ -0,0 +1,25 @@
+name: visorando-scraper
+version: 0.1.0.0
+license: BSD3
+license-file: LICENSE
+author: Joris
+maintainer: joris@guyonvarch.me
+category: Data
+build-type: Simple
+extra-source-files: ChangeLog.md
+cabal-version: >=1.10
+
+executable visorando-scraper
+  hs-source-dirs: src
+  main-is: Main.hs
+  build-depends: base
+    , bytestring
+    , text
+    , tagsoup
+    , http-conduit
+  default-language: Haskell2010
+  default-extensions: OverloadedStrings
+  other-modules: HTTP
+    , Scrapper
+    , Hiking
+    , CSV
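
For readers new to TagSoup, the URL-extraction step in `Scrapper.hikingsUrls` can be tried in isolation. The sketch below reuses its pipeline (partition the tag stream on each `<div class="rando-title-sansDetail">` block, then read the `href` of the first `<a>` inside) against a made-up miniature page; only the class name comes from the commit, while the sample markup is an illustrative assumption about what a result page looks like.

```haskell
{-# LANGUAGE OverloadedStrings #-}

module Main (main) where

import           Data.Maybe        (listToMaybe, mapMaybe)
import           Data.Text         (Text)
import qualified Data.Text         as T
import qualified Data.Text.IO      as T
import           Text.HTML.TagSoup ((~/=), (~==))
import qualified Text.HTML.TagSoup as S

-- Illustrative miniature of a result page; only the class name is taken
-- from Scrapper.hs, the rest of the markup is invented for the example.
samplePage :: Text
samplePage = T.concat
  [ "<div class=\"rando-title-sansDetail\"><a href=\"/rando-a.html\">A</a></div>"
  , "<div class=\"rando-title-sansDetail\"><a href=\"/rando-b.html\">B</a></div>"
  ]

-- Same pipeline as Scrapper.hikingsUrls: split the tag stream into one
-- chunk per result block, then read the href of the first <a> in each.
hikingsUrls :: Text -> [Text]
hikingsUrls =
  mapMaybe sectionUrl
    . S.partitions (~== ("<div class=rando-title-sansDetail>" :: String))
    . S.parseTags
  where
    sectionUrl =
      fmap (T.strip . S.fromAttrib "href")
        . listToMaybe
        . dropWhile (~/= ("<a>" :: String))

main :: IO ()
main = mapM_ T.putStrLn (hikingsUrls samplePage)
```

Run against `samplePage`, this prints `/rando-a.html` and `/rando-b.html`, one per line.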
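
Two small conventions in the commit are also easy to check on their own: `Scrapper.latAndLng` splits the text found next to "Départ :" into five words, drops each coordinate's last character (presumably a degree sign), and prefixes a minus sign for `S` latitudes and `O` (west) longitudes, while `CSV.line` wraps any value containing a comma in double quotes (embedded quotes are not escaped). A minimal sketch, assuming only the `text` package; the coordinate string and the CSV row below are invented examples.

```haskell
{-# LANGUAGE OverloadedStrings #-}

module Main (main) where

import           Data.Text    (Text)
import qualified Data.Text    as T
import qualified Data.Text.IO as T

-- Same shape as Scrapper.latAndLng: five whitespace-separated words,
-- the third (a separator) ignored; "S" and "O" flip the sign.
latAndLng :: Text -> (Text, Text)
latAndLng t =
  let [ns, lat, _, oe, lng] = T.words t
      a = T.concat [ if ns == "S" then "-" else "", T.dropEnd 1 lat ]
      b = T.concat [ if oe == "O" then "-" else "", T.dropEnd 1 lng ]
  in (a, b)

-- Same shape as CSV.line: quote a value only when it contains a comma.
csvLine :: [Text] -> Text
csvLine = T.intercalate "," . map value
  where
    value text =
      if T.isInfixOf "," text
      then T.concat [ "\"", text, "\"" ]
      else text

main :: IO ()
main = do
  -- Made-up coordinate text in the five-word layout latAndLng expects.
  print (latAndLng "N 42.123456° - E 0.654321°")
  -- A row whose second field contains a comma and therefore gets quoted.
  T.putStrLn (csvLine [ "Lac d'Oô", "boucle facile, 3h", "42.123456" ])
```

Under those assumptions the sketch prints `("42.123456","0.654321")` followed by `Lac d'Oô,"boucle facile, 3h",42.123456`. The executable added by this commit is expected to be invoked as something like `cabal new-run visorando-scraper -- <search-result-url>`, with the CSV written to stdout by `Main.main`.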