Merge pull request #153 from blockchain-etl/refactor-parse-dags
Refactor Parse Dags
Showing 13 changed files with 220 additions and 88 deletions.
airflow/dags/polygonetl_airflow/parse/parse_dataset_folder_logic.py
79 changes: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
import logging
import time

from polygonetl_airflow.common import get_list_of_files, read_json_file
from polygonetl_airflow.parse.parse_table_definition_logic import parse, ref_regex
from polygonetl_airflow.parse.toposort import toposort_flatten


def parse_dataset_folder(
    bigquery_client,
    dataset_folder,
    ds,
    source_project_id,
    source_dataset_name,
    destination_project_id,
    internal_project_id,
    sqls_folder,
    parse_all_partitions,
    time_func=time.time
):
    logging.info(f'Parsing dataset folder {dataset_folder}')
    json_files = get_list_of_files(dataset_folder, '*.json')
    logging.info(json_files)

    topologically_sorted_json_files = topologically_sort_json_files(json_files)
    logging.info(f'Topologically sorted json files: {topologically_sorted_json_files}')

    for index, json_file in enumerate(topologically_sorted_json_files):
        logging.info(f'Parsing json file {index} out of {len(topologically_sorted_json_files)}: {json_file}')
        table_definition = read_json_file(json_file)
        parse(
            bigquery_client,
            table_definition,
            ds,
            source_project_id,
            source_dataset_name,
            destination_project_id,
            internal_project_id,
            sqls_folder,
            parse_all_partitions,
            time_func=time_func
        )


def topologically_sort_json_files(json_files):
    table_name_to_file_map = {}
    dependencies = {}

    for json_file in json_files:
        table_definition = read_json_file(json_file)

        contract_address = table_definition['parser']['contract_address']

        ref_dependencies = ref_regex.findall(contract_address) if contract_address is not None else None

        table_name = get_table_name_from_json_file_name(json_file)

        dependencies[table_name] = set(ref_dependencies) if ref_dependencies is not None else set()
        table_name_to_file_map[table_name] = json_file

    validate_dependencies(dependencies, table_name_to_file_map.keys())
    logging.info(f'Table definition dependencies: {dependencies}')

    # TODO: Use toposort() instead of toposort_flatten() so that independent tables could be run in parallel
    sorted_tables = list(toposort_flatten(dependencies))

    topologically_sorted_json_files = [table_name_to_file_map[table_name] for table_name in sorted_tables]
    return topologically_sorted_json_files


def validate_dependencies(dependencies, table_names):
    for deps in dependencies.values():
        for dep_table_name in deps:
            if dep_table_name not in table_names:
                raise ValueError(f'Dependency {dep_table_name} not found. Check ref() in table definitions')


def get_table_name_from_json_file_name(json_file_name):
    return json_file_name.split('/')[-1].replace('.json', '')
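For illustration only (not part of this commit): topologically_sort_json_files() builds a map from each table name to the set of table names referenced via ref() in its contract_address, and that map is what toposort_flatten() consumes. A minimal sketch with hypothetical table names:

from polygonetl_airflow.parse.toposort import toposort_flatten

# Hypothetical dependency map, shaped like the one built by
# topologically_sort_json_files(): each key is a table name, each value is
# the set of tables it references via ref() in its contract_address.
dependencies = {
    'token_transfers': set(),             # no ref() in its contract_address
    'pair_created': {'token_transfers'},  # contract_address contains ref('token_transfers')
    'swaps': {'pair_created'},
}

print(toposort_flatten(dependencies))
# ['token_transfers', 'pair_created', 'swaps']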
File renamed without changes.
@@ -0,0 +1,96 @@
#######################################################################
# Implements a topological sort algorithm.
#
# Copyright 2014-2021 True Blade Systems, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Notes:
#  Based on http://code.activestate.com/recipes/578272-topological-sort
#  with these major changes:
#   Added unittests.
#   Deleted doctests (maybe not the best idea in the world, but it cleans
#    up the docstring).
#   Moved functools import to the top of the file.
#   Changed assert to a ValueError.
#   Changed iter[items|keys] to [items|keys], for python 3
#    compatibility. I don't think it matters for python 2 these are
#    now lists instead of iterables.
#   Copy the input so as to leave it unmodified.
#   Renamed function from toposort2 to toposort.
#   Handle empty input.
#   Switch tests to use set literals.
#
########################################################################

from functools import reduce as _reduce

__all__ = ["toposort", "toposort_flatten", "CircularDependencyError"]
__version__ = "1.7"


class CircularDependencyError(ValueError):
    def __init__(self, data):
        # Sort the data just to make the output consistent, for use in
        # error messages. That's convenient for doctests.
        s = "Circular dependencies exist among these items: {{{}}}".format(
            ", ".join(
                "{!r}:{!r}".format(key, value) for key, value in sorted(data.items())
            )
        )
        super(CircularDependencyError, self).__init__(s)
        self.data = data


def toposort(data):
    """\
    Dependencies are expressed as a dictionary whose keys are items
    and whose values are a set of dependent items. Output is a list of
    sets in topological order. The first set consists of items with no
    dependences, each subsequent set consists of items that depend upon
    items in the preceding sets."""

    # Special case empty input.
    if len(data) == 0:
        return

    # Copy the input so as to leave it unmodified.
    # Discard self-dependencies and copy two levels deep.
    data = {item: set(e for e in dep if e != item) for item, dep in data.items()}

    # Find all items that don't depend on anything.
    extra_items_in_deps = _reduce(set.union, data.values()) - set(data.keys())
    # Add empty dependences where needed.
    data.update({item: set() for item in extra_items_in_deps})
    while True:
        ordered = set(item for item, dep in data.items() if len(dep) == 0)
        if not ordered:
            break
        yield ordered
        data = {
            item: (dep - ordered) for item, dep in data.items() if item not in ordered
        }
    if len(data) != 0:
        raise CircularDependencyError(data)


def toposort_flatten(data, sort=True):
    """\
    Returns a single list of dependencies. For any set returned by
    toposort(), those items are sorted and appended to the result (just to
    make the results deterministic)."""

    result = []
    for d in toposort(data):
        result.extend((sorted if sort else list)(d))
    return result
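For reference, a minimal usage sketch (not part of this commit) of the vendored toposort module above, with hypothetical input data:

data = {2: {11}, 9: {11, 8}, 10: {11, 3}, 11: {7, 5}, 8: {7, 3}}

print(list(toposort(data)))
# [{3, 5, 7}, {8, 11}, {2, 9, 10}]

print(toposort_flatten(data))
# [3, 5, 7, 8, 11, 2, 9, 10]

# A cycle raises CircularDependencyError (a subclass of ValueError):
try:
    list(toposort({1: {2}, 2: {1}}))
except CircularDependencyError as exc:
    print(exc)  # Circular dependencies exist among these items: {1:{2}, 2:{1}}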