aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorJoris2019-03-10 19:15:45 +0100
committerJoris2019-03-10 19:15:45 +0100
commit8e03a571033a5d02d2287a2b1d1edd8b57aa2462 (patch)
tree03d6965bb65322dbac03edf2d6a30ee856c197f6 /src
parentc41f16ed474376ad8a61a75d8b7f9ef543f359b4 (diff)
Scrap result page and make a CSV of it
Diffstat (limited to 'src')
-rw-r--r--src/CSV.hs17
-rw-r--r--src/HTTP.hs11
-rw-r--r--src/Hiking.hs42
-rw-r--r--src/Main.hs34
-rw-r--r--src/README.md5
-rw-r--r--src/Scrapper.hs75
6 files changed, 184 insertions, 0 deletions
diff --git a/src/CSV.hs b/src/CSV.hs
new file mode 100644
index 0000000..9ad2974
--- /dev/null
+++ b/src/CSV.hs
@@ -0,0 +1,17 @@
+module CSV (lines, line) where
+
+import Data.Text (Text)
+import qualified Data.Text as T
+import Prelude hiding (lines)
+
+lines :: [[Text]] -> Text
+lines = T.unlines . map line
+
+line :: [Text] -> Text
+line = T.intercalate "," . map value
+
+value :: Text -> Text
+value text =
+ if (T.isInfixOf "," text)
+ then T.concat [ "\"", text, "\"" ]
+ else text
diff --git a/src/HTTP.hs b/src/HTTP.hs
new file mode 100644
index 0000000..5fb3374
--- /dev/null
+++ b/src/HTTP.hs
@@ -0,0 +1,11 @@
+module HTTP (get) where
+
+import qualified Data.ByteString.Lazy as BS
+import Data.Text (Text)
+import qualified Data.Text as T
+import Data.Text.Encoding as T
+import qualified Data.Text.IO as T
+import qualified Network.HTTP.Conduit as HTTP
+
+get :: Text -> IO Text
+get url = (T.decodeUtf8 . BS.toStrict) <$> HTTP.simpleHttp (T.unpack url)
diff --git a/src/Hiking.hs b/src/Hiking.hs
new file mode 100644
index 0000000..2d04b4f
--- /dev/null
+++ b/src/Hiking.hs
@@ -0,0 +1,42 @@
+module Hiking (Hiking(..), header, values) where
+
+import Data.Text
+import qualified Data.Text as T
+
+data Hiking = Hiking
+ { name :: Text
+ , description :: Text
+ , lat :: Text
+ , lng :: Text
+ , url :: Text
+ , duration :: Text
+ , distance :: Text
+ , elevation :: Text
+ , difficulty :: Text
+ } deriving (Eq, Show)
+
+header :: [Text]
+header =
+ [ "name"
+ , "lat"
+ , "lng"
+ , "link"
+ , "durée"
+ , "distance"
+ , "dénivelé"
+ , "color"
+ , "infos"
+ ]
+
+values :: Hiking -> [Text]
+values h =
+ [ name h
+ , lat h
+ , lng h
+ , url h
+ , duration h
+ , distance h
+ , elevation h
+ , difficulty h
+ , description h
+ ]
diff --git a/src/Main.hs b/src/Main.hs
new file mode 100644
index 0000000..0d527b5
--- /dev/null
+++ b/src/Main.hs
@@ -0,0 +1,34 @@
+module Main (main) where
+
+import Data.Text (Text)
+import qualified Data.Text as T
+import qualified Data.Text.IO as T
+import qualified System.Environment as Env
+import qualified System.Exit as Exit
+
+import qualified CSV
+import Hiking (Hiking (Hiking))
+import qualified Hiking
+import qualified HTTP
+import qualified Scrapper
+
+main :: IO ()
+main = do
+ url <- getUrl
+ hikings <- getHikings url
+ T.putStrLn . CSV.lines . (:) Hiking.header . map Hiking.values $ hikings
+
+getUrl :: IO Text
+getUrl = do
+ args <- Env.getArgs
+ case args of
+ (url:_) -> return $ T.pack url
+ _ -> Exit.die "No url providen"
+
+getHikings :: Text -> IO [Hiking]
+getHikings url = do
+ urls <- Scrapper.hikingsUrls <$> HTTP.get url
+ mapM getHiking urls
+
+getHiking :: Text -> IO Hiking
+getHiking = fmap Scrapper.hiking . HTTP.get
diff --git a/src/README.md b/src/README.md
new file mode 100644
index 0000000..03c94b4
--- /dev/null
+++ b/src/README.md
@@ -0,0 +1,5 @@
+# Visorando scrapper
+
+Produce a CSV of hikings from a [search result
+page](https://www.visorando.com/randonnee-pyrenees.html`) on
+[visorando](https://www.visorando.com/).
diff --git a/src/Scrapper.hs b/src/Scrapper.hs
new file mode 100644
index 0000000..456a455
--- /dev/null
+++ b/src/Scrapper.hs
@@ -0,0 +1,75 @@
+module Scrapper (hikingsUrls, hiking, Hiking(..)) where
+
+import qualified Data.List as List
+import qualified Data.Maybe as Maybe
+import Data.Text (Text)
+import qualified Data.Text as T
+import Text.HTML.TagSoup (Tag (TagOpen, TagText), (~/=), (~==))
+import qualified Text.HTML.TagSoup as S
+
+import Hiking (Hiking (..))
+
+hikingsUrls :: Text -> [Text]
+hikingsUrls =
+ Maybe.catMaybes
+ . map sectionUrl
+ . S.partitions (~== ("<div class=rando-title-sansDetail>" :: String))
+ . S.parseTags
+ where
+ sectionUrl =
+ fmap (T.strip . S.fromAttrib "href")
+ . Maybe.listToMaybe
+ . dropWhile (~/= ("<a>" :: String))
+
+hiking :: Text -> Hiking
+hiking page =
+ Hiking name description lat lng url duration distance elevation difficulty
+ where
+ tags = S.parseTags page
+
+ name = textInside "<h1 itemprop=name>" "</h1" tags
+
+ description = textInside "<p>" "</p>" . dropWhile (~/= ("<h1 itemprop=name>" :: String)) $ tags
+
+ (lat, lng) = latAndLng (info "Départ :")
+
+ url = S.fromAttrib "href" . head . dropWhile (~/= ("<link>" :: String)) $ tags
+
+ duration = info "Durée moyenne:"
+
+ distance = info "Distance :"
+
+ elevation = T.concat [ "+", info "Dénivelé positif :", " -", info "Dénivelé négatif :" ]
+
+ difficulty =
+ case info "Difficulté :" of
+ "Facile" -> "green"
+ "Moyenne" -> "blue"
+ "Difficile" -> "red"
+ _ -> "gray"
+
+ info title = nextTagText . drop 1 . dropWhile (not . tagTextContains title) $ tags
+
+ tagTextContains title (TagText t) = title == T.strip t
+ tagTextContains _ _ = False
+
+
+latAndLng :: Text -> (Text, Text)
+latAndLng t =
+ let [ns, lat, _, oe, lng] = T.words t
+ a = T.concat [ if ns == "S" then "-" else "", T.dropEnd 1 lat ]
+ b = T.concat [ if oe == "O" then "-" else "", T.dropEnd 1 lng ]
+ in (a, b)
+
+textInside :: Text -> Text -> [Tag Text] -> Text
+textInside from to =
+ T.unwords
+ . map (T.strip)
+ . T.lines
+ . T.strip
+ . S.innerText
+ . takeWhile (~/= T.unpack to)
+ . dropWhile (~/= T.unpack from)
+
+nextTagText :: [Tag Text] -> Text
+nextTagText = T.strip . S.innerText . take 1 . drop 1