added scraper

Markus Mobius 2025-06-28 17:16:27 -04:00
parent a9e1752428
commit be776884a3
11 changed files with 6529 additions and 24 deletions

README.txt

@@ -10,41 +10,41 @@ git clone https://www.gitea.econlabs.org/markusmobiuspublic/RetroIndex
Then navigate to the RetroSampleApp folder:
cd RetroSampleApp
dotnet publish
-cd bin\Debug\net7.0\publish
+cd bin\Release\net9.0\publish
dotnet RetroSampleApp.dll
Expected Output
-------------
-7 stores are available between 2/15/2023 12:00:00 AM and 2/21/2023 12:00:00 AM:
-first available time stub: 2023_02_15_00_00_00
-last available time stub: 2023_02_21_00_00_00
-Opening datastore 2023_02_15_00_00_00 from disk ...
-Opening datastore 2023_02_16_00_00_00 from disk ...
-Opening datastore 2023_02_17_00_00_00 from disk ...
-Opening datastore 2023_02_18_00_00_00 from disk ...
-Opening datastore 2023_02_19_00_00_00 from disk ...
-Opening datastore 2023_02_20_00_00_00 from disk ...
-Opening datastore 2023_02_21_00_00_00 from disk ...
-publishers: 9020
+7 stores are available between 6/15/2025 12:00:00 AM and 6/22/2025 12:00:00 AM:
+first available time stub: 2025_06_16_00_00_00
+last available time stub: 2025_06_22_00_00_00
+Opening datastore 2025_06_16_00_00_00 from disk ...
+Opening datastore 2025_06_17_00_00_00 from disk ...
+Opening datastore 2025_06_18_00_00_00 from disk ...
+Opening datastore 2025_06_19_00_00_00 from disk ...
+Opening datastore 2025_06_20_00_00_00 from disk ...
+Opening datastore 2025_06_21_00_00_00 from disk ...
+Opening datastore 2025_06_22_00_00_00 from disk ...
+publishers: 8594
nytimes.com: exists in data
washingtonpost.com: exists in data
Reading filtered publishers ..
-----------------------
Publisher: nytimes.com
-7/7 days: ["2023_02_15_00_00_00","2023_02_16_00_00_00","2023_02_17_00_00_00","2023_02_18_00_00_00","2023_02_19_00_00_00","2023_02_20_00_00_00","2023_02_21_00_00_00"]
-total pages: 105223
-pages with publish data between 2/15/2023 12:00:00 AM and 2/21/2023 12:00:00 AM: 17030
+7/7 days: ["2025_06_16_00_00_00","2025_06_17_00_00_00","2025_06_18_00_00_00","2025_06_19_00_00_00","2025_06_20_00_00_00","2025_06_21_00_00_00","2025_06_22_00_00_00"]
+total pages: 36046
+pages with publish data between 6/15/2025 12:00:00 AM and 6/22/2025 12:00:00 AM: 604
-----------------------
Publisher: washingtonpost.com
-7/7 days: ["2023_02_15_00_00_00","2023_02_16_00_00_00","2023_02_17_00_00_00","2023_02_18_00_00_00","2023_02_19_00_00_00","2023_02_20_00_00_00","2023_02_21_00_00_00"]
-total pages: 28779
-pages with publish data between 2/15/2023 12:00:00 AM and 2/21/2023 12:00:00 AM: 4143
+7/7 days: ["2025_06_16_00_00_00","2025_06_17_00_00_00","2025_06_18_00_00_00","2025_06_19_00_00_00","2025_06_20_00_00_00","2025_06_21_00_00_00","2025_06_22_00_00_00"]
+total pages: 19560
+pages with publish data between 6/15/2025 12:00:00 AM and 6/22/2025 12:00:00 AM: 828
-----------------------
-To incorporate this into your own project, make sure you reference the BingScrapeReadLib (1.5.0+) nuget package:
-https://www.gitea.econlabs.org/markusmobiuspublic/-/packages/nuget/bingscrapereadlib/1.5.0
+To incorporate this into your own project, make sure you reference the BingScrapeReadLib (2.0.0+) nuget package:
+https://www.gitea.econlabs.org/markusmobiuspublic/-/packages/nuget/bingscrapereadlib/2.0.0
The RetroSampleApp demonstrates how.
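
For example, one way to add the reference from the command line is via the dotnet CLI; note that the feed URL below follows Gitea's standard NuGet endpoint layout and is an assumption, not something this page documents:

dotnet nuget add source https://www.gitea.econlabs.org/api/packages/markusmobiuspublic/nuget/index.json --name econlabs
dotnet add package BingScrapeReadLib --version 2.0.0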

RetroExtractor/Program.cs Normal file

@@ -0,0 +1,157 @@
using BingScrapeData;
using BingScrapeReadLib;
using Newtonsoft.Json;

namespace RetroExtractor
{
    internal class Program
    {
        static void Main(string[] args)
        {
            //read config (an alternative config file can be passed as the first command-line argument)
            string config_file = "config.json";
            if (args.Length > 0)
            {
                config_file = args[0];
            }
            if (!File.Exists(config_file))
            {
                Console.WriteLine("Cannot find config file {0}", config_file);
                return;
            }
            config.populate(config_file);
            if (Directory.Exists(config.param.OutputFolder))
            {
                Console.WriteLine($"output folder {config.param.OutputFolder} already exists");
                return;
            }
            Directory.CreateDirectory(config.param.OutputFolder);
            //open data stores between start and end (start and end dates are inclusive)
            var startDate = config.param.ScrapeDate.Start.GetDate();
            var endDate = config.param.ScrapeDate.End.GetDate();
            var reader = new Reader(config.param.BaseFolder, startDate, endDate);
            //count the number of available stores
            var stubs = reader.GetStubs();
            Console.WriteLine($"{stubs.Length} stores are available between {startDate} and {endDate}:");
            if (stubs.Length > 0)
            {
                Console.WriteLine($"first available time stub: {stubs.First()}");
                Console.WriteLine($" last available time stub: {stubs.Last()}");
            }
            //now we open the stores for processing (this just reads the publisher indices across stores, not yet the entire store)
            reader.Open();
            //now we "seal" the stores and lock in the selection (you can include non-existing publishers in the selection - they will be ignored)
            var pubSelection = config.param.PubSelection;
            reader.SealStores(pubSelection);
            //get the list of publishers and check whether the selected publishers are among them
            var pubs = reader.GetPublishers();
            Console.WriteLine($"publishers: {pubs.Length}");
            foreach (var pub in pubSelection)
            {
                if (pubs.Contains(pub))
                {
                    Console.WriteLine($"{pub}: exists in data");
                }
                else
                {
                    Console.WriteLine($"{pub}: does not exist in data");
                }
            }
            //now we retrieve the selected publishers until none are left
            //memory requirement for the client is about 1GB per publisher (for large ones like NYT) and day
            Console.WriteLine("Reading filtered publishers ..");
            var publishers = new List<string>();
            while (true)
            {
                Console.WriteLine("-----------------------");
                var next = reader.ReadNextPublisher();
                if (next == null)
                {
                    //nothing left
                    break;
                }
                //now write some summary stats for this publisher
                Console.WriteLine($"Publisher: {next.pub}");
                int pubID = publishers.Count;
                publishers.Add(next.pub);
                Console.WriteLine($"{next.PublisherByStub.Count}/{stubs.Length} days: {JsonConvert.SerializeObject(next.PublisherByStub.Keys.ToArray())}");
                int pages = 0;
                var freshScrapes = new List<ProcessedBingData>();
                DateTime? start = null;
                DateTime? end = null;
                if (config.param.ArticleDate.ApplyFilter)
                {
                    start = config.param.ArticleDate.Start.GetDate();
                    end = config.param.ArticleDate.End.GetDate();
                }
                foreach (var kvp in next.PublisherByStub)
                {
                    var pubdata = kvp.Value;
                    pages += pubdata.scrapes.Count;
                    foreach (var scrape in pubdata.scrapes)
                    {
                        //scrape has all the data on the article
                        //for example, you can get the text using the readability extractor as follows:
                        if (scrape.readability != null)
                        {
                            var text = scrape.readability.text.getCleanedText();
                        }
                        //we count the number of "fresh" pages with a publish date in the range that we pulled
                        bool isValidPage = false;
                        if (!config.param.ArticleDate.ApplyFilter)
                        {
                            isValidPage = true;
                        }
                        else
                        {
                            if (scrape.PublishDate != null)
                            {
                                //note: the comparison is strict, so pages published exactly on the start or end date are excluded
                                var dt = scrape.PublishDate.Date;
                                if (dt > (DateTime)start && dt < (DateTime)end)
                                {
                                    isValidPage = true;
                                }
                            }
                        }
                        if (isValidPage)
                        {
                            freshScrapes.Add(scrape);
                        }
                    }
                }
                if (config.param.ArticleDate.ApplyFilter)
                {
                    Console.WriteLine($"filtered pages between {start} and {end}: {freshScrapes.Count}/{pages}");
                }
                else
                {
                    Console.WriteLine($"filtered pages (all): {freshScrapes.Count}/{pages}");
                }
                serializeLargeJSON2File(freshScrapes, $"{config.param.OutputFolder}/{pubID}.json");
            }
            //write the publisher names in the same order as the numbered JSON output files
            using (var sw = new StreamWriter($"{config.param.OutputFolder}/publishers.txt"))
            {
                foreach (var publisher in publishers)
                {
                    sw.WriteLine(publisher);
                }
            }
        }
        //streams the JSON to disk instead of first building one large string in memory
        static void serializeLargeJSON2File(object value, string fileName)
        {
            using (StreamWriter writer = new StreamWriter(fileName))
            using (JsonTextWriter jsonWriter = new JsonTextWriter(writer))
            {
                JsonSerializer ser = new JsonSerializer();
                ser.Serialize(jsonWriter, value);
                jsonWriter.Flush();
            }
        }
    }
}
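
Assuming the same publish layout as the sample app (folder names taken from the project files in this commit), running the extractor against one of the bundled configs might look like this; the first argument is the config path and defaults to config.json:

cd RetroExtractor
dotnet publish
cd bin\Release\net9.0\publish
dotnet RetroExtractor.dll config_blira_2025_filtered.json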

RetroExtractor/RetroExtractor.csproj Normal file

@@ -0,0 +1,23 @@
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net9.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>

  <ItemGroup>
    <PackageReference Include="BingScrapeReadLib" Version="2.0.0" />
  </ItemGroup>

  <ItemGroup>
    <None Update="config_blira_2025_unfiltered.json">
      <CopyToOutputDirectory>Always</CopyToOutputDirectory>
    </None>
    <None Update="config_blira_2025_filtered.json">
      <CopyToOutputDirectory>Always</CopyToOutputDirectory>
    </None>
  </ItemGroup>

</Project>

RetroExtractor/config_blira_2025_filtered.json Normal file

@@ -0,0 +1,30 @@
{
  "BaseFolder": "//nerds21/retro2025",
  "ScrapeDate": {
    "Start": {
      "Year": 2025,
      "Month": 6,
      "Day": 15
    },
    "End": {
      "Year": 2025,
      "Month": 6,
      "Day": 22
    }
  },
  "ArticleDate": {
    "ApplyFilter": true,
    "Start": {
      "Year": 2025,
      "Month": 6,
      "Day": 15
    },
    "End": {
      "Year": 2025,
      "Month": 6,
      "Day": 22
    }
  },
  "PubSelection": [ "bbc.com", "cnn.com", "nytimes.com", "washingtonpost.com" ],
  "OutputFolder": "export"
}

RetroExtractor/config_blira_2025_unfiltered.json Normal file

@@ -0,0 +1,20 @@
{
  "BaseFolder": "//nerds21/retro2025",
  "ScrapeDate": {
    "Start": {
      "Year": 2025,
      "Month": 6,
      "Day": 15
    },
    "End": {
      "Year": 2025,
      "Month": 6,
      "Day": 22
    }
  },
  "ArticleDate": {
    "ApplyFilter": false
  },
  "PubSelection": [ "bbc.com", "cnn.com", "nytimes.com", "washingtonpost.com" ],
  "OutputFolder": "export"
}

@@ -0,0 +1,54 @@
using System;
using System.IO;
using Newtonsoft.Json;

namespace RetroExtractor
{
    //a calendar date as it appears in the JSON config
    public class Range
    {
        public int Year;
        public int Month;
        public int Day;

        public DateTime GetDate()
        {
            return new DateTime(Year, Month, Day);
        }
    }

    //inclusive range of scrape dates (which data stores to open)
    public class ScrapeRange
    {
        public Range Start;
        public Range End;
    }

    //optional filter on the article publish date
    public class ArticleRange
    {
        public bool ApplyFilter;
        public Range Start;
        public Range End;
    }

    public class configuration
    {
        public string BaseFolder;
        public ScrapeRange ScrapeDate;
        public ArticleRange ArticleDate;
        public string[] PubSelection;
        public string OutputFolder;
    }

    static class config
    {
        public static configuration param;

        public static void populate(string configpath)
        {
            using (var sr = new StreamReader(configpath))
            {
                param = JsonConvert.DeserializeObject<configuration>(sr.ReadToEnd());
            }
        }
    }
}
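
For reference, a minimal sketch of how these configuration classes fit together, mirroring the calls at the top of RetroExtractor/Program.cs (the config file name is one of the two bundled with this commit):

//populate the static config object from a JSON file and convert the date fields
config.populate("config_blira_2025_filtered.json");
DateTime scrapeStart = config.param.ScrapeDate.Start.GetDate(); //2025-06-15
DateTime scrapeEnd = config.param.ScrapeDate.End.GetDate();     //2025-06-22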

@@ -7,9 +7,12 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "RetroSampleApp", "RetroSamp
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{A68C05D6-1843-4166-A34A-528AE637AC73}"
	ProjectSection(SolutionItems) = preProject
+		publishers_usa.txt = publishers_usa.txt
		README.txt = README.txt
	EndProjectSection
EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "RetroExtractor", "RetroExtractor\RetroExtractor.csproj", "{95A8EC70-30EA-4452-B0EE-CF8192C9013B}"
+EndProject
Global
	GlobalSection(SolutionConfigurationPlatforms) = preSolution
		Debug|Any CPU = Debug|Any CPU
@@ -20,6 +23,10 @@ Global
		{4B4FD66B-68EF-4B63-9DED-6E4D22222C9E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
		{4B4FD66B-68EF-4B63-9DED-6E4D22222C9E}.Debug|Any CPU.Build.0 = Debug|Any CPU
		{4B4FD66B-68EF-4B63-9DED-6E4D22222C9E}.Release|Any CPU.ActiveCfg = Release|Any CPU
		{4B4FD66B-68EF-4B63-9DED-6E4D22222C9E}.Release|Any CPU.Build.0 = Release|Any CPU
+		{95A8EC70-30EA-4452-B0EE-CF8192C9013B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{95A8EC70-30EA-4452-B0EE-CF8192C9013B}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{95A8EC70-30EA-4452-B0EE-CF8192C9013B}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{95A8EC70-30EA-4452-B0EE-CF8192C9013B}.Release|Any CPU.Build.0 = Release|Any CPU
	EndGlobalSection
	GlobalSection(SolutionProperties) = preSolution
		HideSolutionNode = FALSE

RetroSampleApp/Program.cs

@@ -9,9 +9,9 @@ namespace RetroSampleApp
        static void Main(string[] args)
        {
-            //open data stores between 2/15/2023 and 2/21/2023 (start and end dates are inclusive)
-            var startDate = new DateTime(2023, 2, 15);
-            var endDate = new DateTime(2023, 2, 21);
-            var reader = new Reader("//tinybee/retroindexprocessed2023", startDate, endDate);
+            //open data stores between 6/15/2025 and 6/22/2025 (start and end dates are inclusive)
+            var startDate = new DateTime(2025, 6, 15);
+            var endDate = new DateTime(2025, 6, 22);
+            var reader = new Reader("//nerds21/retro2025", startDate, endDate);
            //count the number of available stores
            var stubs = reader.GetStubs();

RetroSampleApp/RetroSampleApp.csproj

@@ -2,7 +2,7 @@
  <PropertyGroup>
    <OutputType>Exe</OutputType>
-    <TargetFramework>net7.0</TargetFramework>
+    <TargetFramework>net9.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>

@@ -0,0 +1,15 @@
{
  "BaseFolder": "//nerds21/retro2025",
  "StartDate": {
    "Year": 2025,
    "Month": 6,
    "Day": 15
  },
  "EndDate": {
    "Year": 2025,
    "Month": 6,
    "Day": 21
  },
  "PubSelection": [ "bbc.com", "cnn.com", "nytimes.com", "washingtonpost.com" ]
}

publishers_usa.txt Normal file

File diff suppressed because it is too large