added two new FAQs (#21)
andwoi committed Jun 9, 2022
1 parent 16846e5 commit 9f23956
Showing 2 changed files with 90 additions and 0 deletions.
8 changes: 8 additions & 0 deletions README.md
@@ -309,6 +309,14 @@ Maybe. This is a work in progress. Also, check out our other building releases!
* [Kenya and Nigeria](https://github.com/microsoft/KenyaNigeriaBuildingFootprints)
* [Indonesia, Malaysia, and the Philippines](https://github.com/microsoft/IdMyPhBuildingFootprints)

### Why are some locations missing?
We excluded imagery from processing if the tiles were dated before 2014 or had a low probability of detection. Detection probability is loosely defined here as proximity to roads and population centers. Because this filtering excludes whole tiles, the missing data appears as squares.

### How can I read large files?
Some files are very large, but they are stored in a line-delimited format, so you can process them with parallel-processing tools (e.g., [Spark](https://spark.apache.org/), [Dask](https://docs.dask.org/en/stable/dataframe.html)) or use a memory-efficient script to split them into smaller pieces. See `scripts/read-large-files.py` for a Python example.
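For the parallel route, a minimal sketch with Dask could look like the following (the `Angola.geojsonl` file name is an assumption, matching the example script):

```python
import json

import dask.bag as db

# read the line-delimited GeoJSON lazily in ~64 MB partitions,
# so the whole file never has to fit in memory at once
lines = db.read_text("Angola.geojsonl", blocksize="64MB")

# parse each line into a dict and count the features in parallel
features = lines.map(json.loads)
print(features.count().compute())
```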


<br>

## Contributing
82 changes: 82 additions & 0 deletions scripts/read-large-files.py
@@ -0,0 +1,82 @@

"""
This python script is an example of how to read a large file, line-delimited and split it into multiple
parts. This can be helpful when using a machine that cannot load an entire file into memory.
"""
import os

def main():
# path to decompressed geojsonl file
input_file = "Angola.geojsonl"

# check to make sure we can find the input file
assert os.path.exists(input_file), f"{input_file} not found!"

# template output file path. the script will populate the curly brackets {} with a number
output_file_template = "Angola_part-{}.geojsonl"

# this is the maximum number of features per file. adjust as desired. 10k features produces ~3MB files.
buildings_per_file = 10_000

# open the large file
with open(input_file) as inf:
# read a single line
line = inf.readline()

# used for updating file numbers
file_counter = 1

# this is where we count the number of features in a single file
lines_per_file = 0

# create the actual file path fome the template above
current_target_file_path = output_file_template.format(file_counter)

# prevent overwriting existing files
assert not os.path.exists(current_target_file_path), f"{current_target_file_path} already exists!"

# open an output file in write mode.
target = open(current_target_file_path, 'w')

# start iterating through each feature
while line:

# write a single feature to the current output files
target.write(line)

# increment the count for number of features in a file
lines_per_file += 1

# go to next feature in the large file
line = inf.readline()

# check if we have hit the desire feature limit per file
if lines_per_file == buildings_per_file:
# close the current target file since we've reached the desired feature limit
target.close()
print(f"wrote {lines_per_file:,} lines to {current_target_file_path}")

# increment the file counter so we can create a new output
file_counter += 1

# reset the line counter for the new output file
lines_per_file = 0

# create the path for the next output file
current_target_file_path = output_file_template.format(file_counter)
# prevent overwriting existing files
assert not os.path.exists(current_target_file_path), f"{current_target_file_path} already exists!"

# open the next output file
target = open(current_target_file_path, 'w')

# when we get here, there are no more features left in the larger file so we close the last target file
if not target.closed:
print(f"wrote {lines_per_file:,} lines to {current_target_file_path}")
target.close()
print(f"Complete!")


if __name__ == "__main__":
main()
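Loading one of the resulting parts back is straightforward; a minimal sketch with pandas (the part file name below is an assumption, following the template in the script):

```python
import pandas as pd

# each part is line-delimited JSON, so pandas can read it with lines=True
df = pd.read_json("Angola_part-1.geojsonl", lines=True)
print(len(df), "features loaded")
```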
