diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index 4a2ef3b0..f338ad2b 100644 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -5,17 +5,7 @@ from fastavro import writer, reader from fastavro._timezone import utc -try: - from fastavro.validate import validate, validate_many -except ImportError: - try: - from fastavro._write import validate - except ImportError: - from fastavro._write_py import validate - - - def validate_many(records, schema): - return all([validate(record, schema) for record in records]) +from fastavro.validation import validate, validate_many def write(schema, records, runs=1): diff --git a/docs/index.rst b/docs/index.rst index fea14ea3..f285bb22 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -68,6 +68,7 @@ Documentation reader writer + validation command_line_script * :ref:`genindex` diff --git a/docs/validation.rst b/docs/validation.rst new file mode 100644 index 00000000..40488a68 --- /dev/null +++ b/docs/validation.rst @@ -0,0 +1,6 @@ +fastavro.validation +=================== + +.. autofunction:: fastavro._validation_py.validate + +.. 
autofunction:: fastavro._validation_py.validate_many diff --git a/fastavro/__init__.py b/fastavro/__init__.py index 20dc57e6..e67dcf25 100644 --- a/fastavro/__init__.py +++ b/fastavro/__init__.py @@ -47,7 +47,7 @@ import fastavro.read import fastavro.write import fastavro.schema -import fastavro.validate +import fastavro.validation def _acquaint_schema(schema): @@ -71,7 +71,7 @@ def _acquaint_schema(schema): acquaint_schema = _acquaint_schema fastavro.schema.acquaint_schema = _acquaint_schema is_avro = fastavro.read.is_avro -validator = fastavro.validate.validate +validate = fastavro.validation.validate __all__ = [ n for n in locals().keys() if not n.startswith('_') diff --git a/fastavro/_schema.pyx b/fastavro/_schema.pyx index 1c9ee928..9acdec0c 100644 --- a/fastavro/_schema.pyx +++ b/fastavro/_schema.pyx @@ -28,7 +28,7 @@ cpdef inline str extract_logical_type(schema): return None -def schema_name(object schema, parent_ns): +def schema_name(schema, parent_ns): name = schema.get('name') if not name: return parent_ns, None diff --git a/fastavro/_validate.pyx b/fastavro/_validate.pyx deleted file mode 100644 index e87c3df4..00000000 --- a/fastavro/_validate.pyx +++ /dev/null @@ -1,419 +0,0 @@ -import datetime -import decimal -import numbers -from collections import Iterable, Mapping - -from . 
import const -from ._six import long, is_str, iterkeys, itervalues -from ._schema import extract_record_type, schema_name -from ._schema_common import SCHEMA_DEFS, UnknownType -from ._validate_common import ValidationError, ValidationErrorData - -ctypedef int int32 -ctypedef unsigned int uint32 -ctypedef unsigned long long ulong64 -ctypedef long long long64 - -cdef int32 INT_MIN_VALUE = const.INT_MIN_VALUE -cdef int32 INT_MAX_VALUE = const.INT_MAX_VALUE -cdef long64 LONG_MIN_VALUE = const.LONG_MIN_VALUE -cdef long64 LONG_MAX_VALUE = const.LONG_MAX_VALUE - -cpdef bint validate_null(datum, schema=None, - str parent_ns='', bint raise_errors=False): - """ - Checks that the data value is None. - - :param datum: data to validate as None - :param raise_errors: not used - :param parent_ns: not used - :param schema: not used - :return: bool - """ - return datum is None - -cpdef bint validate_boolean(datum, schema=None, - str parent_ns='', bint raise_errors=False): - """Check that the data value is bool instance - - :param datum: data to validate as boolean - :param raise_errors: not used - :param parent_ns: not used - :param schema: not used - :return: bool - """ - return isinstance(datum, bool) - -cpdef bint validate_string(datum, schema=None, - str parent_ns='', bint raise_errors=False): - """Check that the data value is string type, uses - six for Python version compatibility. 
- - :param datum: data to validate as string - :param raise_errors: not used - :param parent_ns: not used - :param schema: not used - :return: bool - """ - return is_str(datum) - -cpdef bint validate_bytes(datum, schema=None, - str parent_ns='', bint raise_errors=False): - """ - Check that the data value is - (bytes or decimal.Decimal) - - :param datum: data to validate as bytes - :param raise_errors: not used - :param parent_ns: not used - :param schema: not used - :return: bool - """ - return isinstance(datum, (bytes, decimal.Decimal)) - -cpdef bint validate_int(datum, schema=None, - str parent_ns='', bint raise_errors=False): - """ - Check that the data value is a non floating - point number with size less that Int32. - - Also support for logicalType timestamp validation with datetime. - - Int32 = -2147483648<=datum<=2147483647 - - :param datum: (int, long, numbers.Integral, - datetime.time, datetime.datetime, datetime.date) - :param raise_errors: not used - :param parent_ns: not used - :param schema: not used - :return: bool - """ - return ( - (isinstance(datum, (int, long, numbers.Integral)) and - INT_MIN_VALUE <= datum <= INT_MAX_VALUE) or - isinstance(datum, ( - datetime.time, datetime.datetime, datetime.date)) - ) - -cpdef bint validate_long(datum, schema=None, - str parent_ns='', bint raise_errors=False): - """ - Check that the data value is a non floating - point number with size less that Int32. - - Also support for logicalType timestamp validation with datetime. 
- - Int64 = -9223372036854775808 <= datum <= 9223372036854775807 - - :param datum: number: data to validate as long64 - :param raise_errors: not used - :param parent_ns: not used - :param schema: not used - :return: bool - """ - return ( - (isinstance(datum, (int, long, numbers.Integral)) and - LONG_MIN_VALUE <= datum <= LONG_MAX_VALUE) or - isinstance(datum, ( - datetime.time, datetime.datetime, datetime.date)) - ) - -cpdef bint validate_float(datum, schema=None, - str parent_ns='', bint raise_errors=False): - """ - Check that the data value is a floating - point number or double precision. - - :param datum: number: data to validate as float - :param raise_errors: not used - :param parent_ns: not used - :param schema: not used - :return: bool - """ - return isinstance(datum, (int, long, float, numbers.Real)) - -cpdef bint validate_fixed(datum, dict schema, - str parent_ns='', bint raise_errors=False): - """ - Check that the data value is fixed width bytes, - matching the schema['size'] exactly! - - :param datum: (bytes, decimal.Decimal) - :param schema: avro schema of 'fixed' type - :param parent_ns: not used - :param raise_errors: not used - :return: bool - """ - return (isinstance(datum, bytes) and - len(datum) == schema['size']) or \ - (isinstance(datum, decimal.Decimal)) - -cpdef bint validate_enum(datum, dict schema, - str parent_ns='', bint raise_errors=False): - """ - Check that the data value matches one of the enum symbols. - - i.e "blue" in ["red", green", "blue"] - - :param datum: str: data to validate in enum symbols - :param schema: avro schema of 'enum' type - :param parent_ns: not used - :param raise_errors: not used - :return: bool - """ - return datum in schema['symbols'] - -cpdef bint validate_array(datum, dict schema, - str parent_ns='', bint raise_errors=False): - """ - Check that the data list values all match schema['items']. 
- - :param datum: list: data to validate as specified "items" type - :param schema: avro schema of 'array' type - :param parent_ns: str: parent namespace - :param raise_errors: bool: should raise ValidationError - :return: bool - :except: ValidationError - """ - if not isinstance(datum, Iterable) or is_str(datum): - return False - - if raise_errors: - namespace, name = schema_name(schema, parent_ns) - else: - name = parent_ns - for d in datum: - if not validate(datum=d, schema=schema['items'], - field=name, - raise_errors=raise_errors): - return False - return True - -cpdef bint validate_map(object datum, dict schema, str parent_ns='', - bint raise_errors=False): - """ - Check that the data is a Map(k,v) - matching values to schema['values'] type. - - :param datum: Mapping: data to validate as specified "items" type - :param schema: avro schema of 'map' type - :param parent_ns: str: parent namespace - :param raise_errors: bool: should raise ValidationError - :return: bool - :except: ValidationError - """ - # initial checks for map type - if not isinstance(datum, Mapping): - return False - for k in iterkeys(datum): - if not is_str(k): - return False - - if raise_errors: - namespace, name = schema_name(schema, parent_ns) - else: - name = parent_ns - for v in itervalues(datum): - if not validate(datum=v, schema=schema['values'], - field=name, - raise_errors=raise_errors): - return False - return True - -cpdef bint validate_record(object datum, dict schema, str parent_ns='', - bint raise_errors=False): - """ - Check that the data is a Mapping type with all schema defined fields - validated as True. 
- - :param datum: Mapping: data to validate schema fields - :param schema: avro schema of 'record' type - :param parent_ns: str: parent namespace - :param raise_errors: bool: should raise ValidationError - :return: bool - :except: ValidationError - """ - if not isinstance(datum, Mapping): - return False - if raise_errors: - namespace, name = schema_name(schema, parent_ns) - else: - name = parent_ns - for f in schema['fields']: - if not validate(datum=datum.get(f['name'], f.get('default')), - schema=f['type'], - field=schema_name(f, name)[1] if raise_errors else name, - raise_errors=raise_errors): - return False - return True - -cpdef bint validate_union(object datum, list schema, str parent_ns=None, - bint raise_errors=False): - """ - Check that the data is a list type with possible options to - validate as True. - - :param datum: (Iterable, tuple(name, Iterable)): data to validate - as multiple data types - :param schema: avro schema of 'union' type - :param parent_ns: str: parent namespace - :param raise_errors: bool: should raise ValidationError - :return: bool - :except: ValidationError - """ - if isinstance(datum, tuple): - (name, datum) = datum - for candidate in schema: - if extract_record_type(candidate) == 'record': - if name == candidate["name"]: - return validate(datum, schema=candidate, - field=parent_ns, - raise_errors=raise_errors) - else: - return False - - cdef list errors = [] - for s in schema: - try: - ret = validate(datum, schema=s, - field=parent_ns, - raise_errors=raise_errors) - if ret: - # We exit on the first passing type in Unions - return True - except ValidationError as e: - errors.extend(e.errors) - if raise_errors: - raise ValidationError(*errors) - return False - -cpdef BASE_VALIDATORS = { - 'null': validate_null, - 'boolean': validate_boolean, - 'string': validate_string, - 'int': validate_int, - 'long': validate_long, - 'float': validate_float, - 'double': validate_float, - 'bytes': validate_bytes, - 'fixed': validate_fixed, - 
'enum': validate_enum, - 'array': validate_array, - 'map': validate_map, - 'union': validate_union, - 'error_union': validate_union, - 'record': validate_record, - 'error': validate_record, - 'request': validate_record -} - -cpdef VALIDATORS = BASE_VALIDATORS.copy() - -cpdef void register_validator(record_type, validator): - if record_type in BASE_VALIDATORS: - raise ValueError("Not allowed to override Base Validators.") - VALIDATORS[record_type] = validator - -cpdef get_validator(record_type): - return VALIDATORS.get(record_type) - -cpdef validate(object datum, object schema, str field='', - bint raise_errors=False): - """Determine if a python datum is an instance of a schema.""" - record_type = extract_record_type(schema) - result = None - ns_field = '' - - if hasattr(schema, 'get') and raise_errors: - parent_ns, ns_field = schema_name(schema, None) - elif field: - ns_field = field - - # explicit, so that cython is faster, but only for Base Validators - if record_type == 'null': - result = validate_null(datum, schema=schema, parent_ns=ns_field, - raise_errors=raise_errors) - elif record_type == 'boolean': - result = validate_boolean(datum, schema=schema, parent_ns=ns_field, - raise_errors=raise_errors) - elif record_type == 'string': - result = validate_string(datum, schema=schema, parent_ns=ns_field, - raise_errors=raise_errors) - elif record_type == 'int': - result = validate_int(datum, schema=schema, parent_ns=ns_field, - raise_errors=raise_errors) - elif record_type == 'long': - result = validate_long(datum, schema=schema, parent_ns=ns_field, - raise_errors=raise_errors) - elif record_type in ('float', 'double'): - result = validate_float(datum, schema=schema, parent_ns=ns_field, - raise_errors=raise_errors) - elif record_type == 'bytes': - result = validate_bytes(datum, schema=schema, parent_ns=ns_field, - raise_errors=raise_errors) - elif record_type == 'fixed': - result = validate_fixed(datum, schema=schema, parent_ns=ns_field, - raise_errors=raise_errors) 
- elif record_type == 'enum': - result = validate_enum(datum, schema=schema, parent_ns=ns_field, - raise_errors=raise_errors) - elif record_type == 'array': - result = validate_array(datum, schema=schema, parent_ns=ns_field, - raise_errors=raise_errors) - elif record_type == 'map': - result = validate_map(datum, schema=schema, parent_ns=ns_field, - raise_errors=raise_errors) - elif record_type in ('union', 'error_union'): - result = validate_union(datum, schema=schema, parent_ns=ns_field, - raise_errors=raise_errors) - elif record_type in ('record', 'error', 'request'): - result = validate_record(datum, schema=schema, parent_ns=ns_field, - raise_errors=raise_errors) - else: - validator = get_validator(record_type) - if validator: - result = validator(datum, schema=schema, parent_ns=ns_field, - raise_errors=raise_errors) - - if record_type in SCHEMA_DEFS and result is None: - result = validate(datum, - schema=SCHEMA_DEFS[record_type], - field=ns_field, - raise_errors=raise_errors) - - if raise_errors and result is False: - raise ValidationError(ValidationErrorData(datum, schema, ns_field)) - - if result is None: - raise UnknownType(record_type) - - return bool(result) - - -cpdef validate_many(records, schema, bint raise_errors=False, int stop_count=-1): - """ - Validate a list of data! 
- - :param records: Iterable: list of records to validate - :param schema: Avro schema - :param raise_errors: bool: should raise ValidationError - :param stop_count: int: stop early for error count - :return: bool - :except: ValidationError - """ - cdef int error_count = 0 - cdef bint result - cdef list errors = [] - cdef list results = [] - for record in records: - try: - result = validate(record, schema, raise_errors=raise_errors) - results.append(result) - except ValidationError as e: - error_count += 1 - errors.extend(e.errors) - if error_count >= stop_count: - break - if raise_errors: - raise ValidationError(*errors) - return all(results) diff --git a/fastavro/_validate_common.py b/fastavro/_validate_common.py index db0ce039..86c6e503 100644 --- a/fastavro/_validate_common.py +++ b/fastavro/_validate_common.py @@ -9,10 +9,10 @@ def __str__(self): self.field = '' if self.datum is None: - return 'Field({field}) is null' \ + return 'Field({field}) is None' \ ' expected {schema}'.format(field=self.field, schema=self.schema) - return '{field} is {datum} of type ' \ + return '{field} is <{datum}> of type ' \ '{given_type} expected {schema}'. \ format(datum=self.datum, given_type=type(self.datum), schema=self.schema, field=self.field) diff --git a/fastavro/_validation.pyx b/fastavro/_validation.pyx new file mode 100644 index 00000000..b6c2657a --- /dev/null +++ b/fastavro/_validation.pyx @@ -0,0 +1,240 @@ +import datetime +import decimal +import numbers +from collections import Iterable, Mapping + +from . 
import const +from ._six import long, is_str, iterkeys, itervalues +from ._schema import extract_record_type, schema_name +from ._schema_common import SCHEMA_DEFS, UnknownType +from ._validate_common import ValidationError, ValidationErrorData + +ctypedef int int32 +ctypedef unsigned int uint32 +ctypedef unsigned long long ulong64 +ctypedef long long long64 + +cdef int32 INT_MIN_VALUE = const.INT_MIN_VALUE +cdef int32 INT_MAX_VALUE = const.INT_MAX_VALUE +cdef long64 LONG_MIN_VALUE = const.LONG_MIN_VALUE +cdef long64 LONG_MAX_VALUE = const.LONG_MAX_VALUE + +cdef inline bint validate_null(datum, schema=None, + str parent_ns='', bint raise_errors=True): + return datum is None + + +cdef inline bint validate_boolean(datum, schema=None, + str parent_ns='', bint raise_errors=True): + return isinstance(datum, bool) + + +cdef inline bint validate_string(datum, schema=None, + str parent_ns='', bint raise_errors=True): + return is_str(datum) + + +cdef inline bint validate_bytes(datum, schema=None, + str parent_ns='', bint raise_errors=True): + return isinstance(datum, (bytes, decimal.Decimal)) + + +cdef inline bint validate_int(datum, schema=None, + str parent_ns='', bint raise_errors=True): + return ( + (isinstance(datum, (int, long, numbers.Integral)) and + INT_MIN_VALUE <= datum <= INT_MAX_VALUE) or + isinstance(datum, ( + datetime.time, datetime.datetime, datetime.date)) + ) + + +cdef inline bint validate_long(datum, schema=None, + str parent_ns='', bint raise_errors=True): + return ( + (isinstance(datum, (int, long, numbers.Integral)) and + LONG_MIN_VALUE <= datum <= LONG_MAX_VALUE) or + isinstance(datum, ( + datetime.time, datetime.datetime, datetime.date)) + ) + + +cdef inline bint validate_float(datum, schema=None, + str parent_ns='', bint raise_errors=True): + return isinstance(datum, (int, long, float, numbers.Real)) + + +cdef inline bint validate_fixed(datum, dict schema, + str parent_ns='', bint raise_errors=True): + return (isinstance(datum, bytes) and + 
len(datum) == schema['size']) or \ + (isinstance(datum, decimal.Decimal)) + + +cdef inline bint validate_enum(datum, dict schema, + str parent_ns='', bint raise_errors=True): + return datum in schema['symbols'] + + +cdef inline bint validate_array(datum, dict schema, + str parent_ns='', bint raise_errors=True) except -1: + if not isinstance(datum, Iterable) or is_str(datum): + return False + + if raise_errors: + namespace, name = schema_name(schema, parent_ns) + else: + name = parent_ns + for d in datum: + if not validate(datum=d, schema=schema['items'], + field=name, + raise_errors=raise_errors): + return False + return True + + +cdef inline bint validate_map(object datum, dict schema, str parent_ns='', + bint raise_errors=True) except -1: + # initial checks for map type + if not isinstance(datum, Mapping): + return False + for k in iterkeys(datum): + if not is_str(k): + return False + + if raise_errors: + namespace, name = schema_name(schema, parent_ns) + else: + name = parent_ns + for v in itervalues(datum): + if not validate(datum=v, schema=schema['values'], + field=name, + raise_errors=raise_errors): + return False + return True + + +cdef inline bint validate_record(object datum, dict schema, str parent_ns='', + bint raise_errors=True) except -1: + if not isinstance(datum, Mapping): + return False + if raise_errors: + namespace, name = schema_name(schema, parent_ns) + else: + name = parent_ns + for f in schema['fields']: + if not validate(datum=datum.get(f['name'], f.get('default')), + schema=f['type'], + field=schema_name(f, name)[1] if raise_errors else name, + raise_errors=raise_errors): + return False + return True + + +cdef inline bint validate_union(object datum, list schema, str parent_ns=None, + bint raise_errors=True) except -1: + if isinstance(datum, tuple): + (name, datum) = datum + for candidate in schema: + if extract_record_type(candidate) == 'record': + if name == candidate["name"]: + return validate(datum, schema=candidate, + field=parent_ns, + 
raise_errors=raise_errors) + else: + return False + + cdef list errors = [] + for s in schema: + try: + ret = validate(datum, schema=s, + field=parent_ns, + raise_errors=raise_errors) + if ret: + # We exit on the first passing type in Unions + return True + except ValidationError as e: + errors.extend(e.errors) + if raise_errors: + raise ValidationError(*errors) + return False + + +cpdef validate(object datum, object schema, str field='', + bint raise_errors=True): + record_type = extract_record_type(schema) + result = None + ns_field = '' + + if hasattr(schema, 'get') and raise_errors: + parent_ns, ns_field = schema_name(schema, None) + elif field: + ns_field = field + + # explicit, so that cython is faster, but only for Base Validators + if record_type == 'null': + result = validate_null(datum, schema=schema, parent_ns=ns_field, + raise_errors=raise_errors) + elif record_type == 'boolean': + result = validate_boolean(datum, schema=schema, parent_ns=ns_field, + raise_errors=raise_errors) + elif record_type == 'string': + result = validate_string(datum, schema=schema, parent_ns=ns_field, + raise_errors=raise_errors) + elif record_type == 'int': + result = validate_int(datum, schema=schema, parent_ns=ns_field, + raise_errors=raise_errors) + elif record_type == 'long': + result = validate_long(datum, schema=schema, parent_ns=ns_field, + raise_errors=raise_errors) + elif record_type in ('float', 'double'): + result = validate_float(datum, schema=schema, parent_ns=ns_field, + raise_errors=raise_errors) + elif record_type == 'bytes': + result = validate_bytes(datum, schema=schema, parent_ns=ns_field, + raise_errors=raise_errors) + elif record_type == 'fixed': + result = validate_fixed(datum, schema=schema, parent_ns=ns_field, + raise_errors=raise_errors) + elif record_type == 'enum': + result = validate_enum(datum, schema=schema, parent_ns=ns_field, + raise_errors=raise_errors) + elif record_type == 'array': + result = validate_array(datum, schema=schema, 
parent_ns=ns_field, + raise_errors=raise_errors) + elif record_type == 'map': + result = validate_map(datum, schema=schema, parent_ns=ns_field, + raise_errors=raise_errors) + elif record_type in ('union', 'error_union'): + result = validate_union(datum, schema=schema, parent_ns=ns_field, + raise_errors=raise_errors) + elif record_type in ('record', 'error', 'request'): + result = validate_record(datum, schema=schema, parent_ns=ns_field, + raise_errors=raise_errors) + elif record_type in SCHEMA_DEFS: + result = validate(datum, + schema=SCHEMA_DEFS[record_type], + field=ns_field, + raise_errors=raise_errors) + else: + raise UnknownType(record_type) + + if raise_errors and result is False: + raise ValidationError(ValidationErrorData(datum, schema, ns_field)) + + return bool(result) + + +cpdef validate_many(records, schema, bint raise_errors=True): + cdef bint result + cdef list errors = [] + cdef list results = [] + for record in records: + try: + result = validate(record, schema, raise_errors=raise_errors) + results.append(result) + except ValidationError as e: + errors.extend(e.errors) + if raise_errors: + raise ValidationError(*errors) + return all(results) diff --git a/fastavro/_validate_py.py b/fastavro/_validation_py.py similarity index 66% rename from fastavro/_validate_py.py rename to fastavro/_validation_py.py index ad43d8f2..9328c3a7 100644 --- a/fastavro/_validate_py.py +++ b/fastavro/_validation_py.py @@ -3,8 +3,9 @@ import numbers from collections import Iterable, Mapping -from fastavro.const import INT_MAX_VALUE, INT_MIN_VALUE, \ - LONG_MAX_VALUE, LONG_MIN_VALUE +from fastavro.const import ( + INT_MAX_VALUE, INT_MIN_VALUE, LONG_MAX_VALUE, LONG_MIN_VALUE +) from ._validate_common import ValidationError, ValidationErrorData from .schema import extract_record_type, schema_name, UnknownType from .six import long, is_str, iterkeys, itervalues @@ -15,9 +16,12 @@ def validate_null(datum, **kwargs): """ Checks that the data value is None. 
- :param datum: None : data to validate as None - :param kwargs: black-hole args - :return: bool + Parameters + ---------- + datum: Any + Data being validated + kwargs: Any + Unused kwargs """ return datum is None @@ -26,9 +30,12 @@ def validate_boolean(datum, **kwargs): """ Check that the data value is bool instance - :param datum: (bool) : data to validate as boolean - :param kwargs: black-hole args - :return: bool + Parameters + ---------- + datum: Any + Data being validated + kwargs: Any + Unused kwargs """ return isinstance(datum, bool) @@ -38,9 +45,12 @@ def validate_string(datum, **kwargs): Check that the data value is string type, uses six for Python version compatibility. - :param datum: (str, basestring, unicode) : data to validate as string - :param kwargs: black-hole args - :return: bool + Parameters + ---------- + datum: Any + Data being validated + kwargs: Any + Unused kwargs """ return is_str(datum) @@ -49,9 +59,12 @@ def validate_bytes(datum, **kwargs): """ Check that the data value is (python bytes type or decimal.Decimal type - :param datum: (bytes, decimal.Decimal): data to validate as bytes - :param kwargs: black-hole args - :return: bool + Parameters + ---------- + datum: Any + Data being validated + kwargs: Any + Unused kwargs """ return isinstance(datum, (bytes, decimal.Decimal)) @@ -68,9 +81,12 @@ def validate_int(datum, **kwargs): (int, long, numbers.Integral, datetime.time, datetime.datetime, datetime.date) - :param datum: number: data to validate as int32 - :param kwargs: black-hole args - :return: bool + Parameters + ---------- + datum: Any + Data being validated + kwargs: Any + Unused kwargs """ return ( (isinstance(datum, (int, long, numbers.Integral)) and @@ -92,9 +108,12 @@ def validate_long(datum, **kwargs): (int, long, numbers.Integral, datetime.time, datetime.datetime, datetime.date) - :param datum: number: data to validate as long64 - :param kwargs: black-hole args - :return: bool + Parameters + ---------- + datum: Any + Data
being validated + kwargs: Any + Unused kwargs """ return ( (isinstance(datum, (int, long, numbers.Integral)) and @@ -112,9 +131,12 @@ def validate_float(datum, **kwargs): conditional python types (int, long, float, numbers.Real) - :param datum: number: data to validate as float - :param kwargs: black-hole args - :return: bool + Parameters + ---------- + datum: Any + Data being validated + kwargs: Any + Unused kwargs """ return isinstance(datum, (int, long, float, numbers.Real)) @@ -124,10 +146,14 @@ def validate_fixed(datum, schema, **kwargs): Check that the data value is fixed width bytes, matching the schema['size'] exactly! - :param datum: (bytes, decimal.Decimal): data to validate as fixed bytes - :param schema: avro schema of 'fixed' type - :param kwargs: black-hole args - :return: bool + Parameters + ---------- + datum: Any + Data being validated + schema: dict + Schema + kwargs: Any + Unused kwargs """ return ( (isinstance(datum, bytes) and len(datum) == schema['size']) @@ -141,27 +167,35 @@ def validate_enum(datum, schema, **kwargs): i.e "blue" in ["red", green", "blue"] - :param datum: str: data to validate in enum symbols - :param schema: avro schema of 'enum' type - :param kwargs: black-hole args - :return: bool + Parameters + ---------- + datum: Any + Data being validated + schema: dict + Schema + kwargs: Any + Unused kwargs """ return datum in schema['symbols'] -def validate_array(datum, schema, parent_ns=None, raise_errors=False): +def validate_array(datum, schema, parent_ns=None, raise_errors=True): """ Check that the data list values all match schema['items']. 
- :param datum: list: data to validate as specified "items" type - :param schema: avro schema of 'array' type - :param parent_ns: str: parent namespace - :param raise_errors: bool: should raise ValidationError - :return: bool - :except: ValidationError + Parameters + ---------- + datum: Any + Data being validated + schema: dict + Schema + parent_ns: str + parent namespace + raise_errors: bool + If true, raises ValidationError on invalid data """ if raise_errors: - namespace, name = schema_name(schema, parent_ns) + _, name = schema_name(schema, parent_ns) else: name = parent_ns return ( @@ -173,20 +207,24 @@ def validate_array(datum, schema, parent_ns=None, raise_errors=False): ) -def validate_map(datum, schema, parent_ns=None, raise_errors=False): +def validate_map(datum, schema, parent_ns=None, raise_errors=True): """ Check that the data is a Map(k,v) matching values to schema['values'] type. - :param datum: Mapping: data to validate as specified "items" type - :param schema: avro schema of 'map' type - :param parent_ns: str: parent namespace - :param raise_errors: bool: should raise ValidationError - :return: bool - :except: ValidationError + Parameters + ---------- + datum: Any + Data being validated + schema: dict + Schema + parent_ns: str + parent namespace + raise_errors: bool + If true, raises ValidationError on invalid data """ if raise_errors: - namespace, name = schema_name(schema, parent_ns) + _, name = schema_name(schema, parent_ns) else: name = parent_ns return ( @@ -198,17 +236,21 @@ def validate_map(datum, schema, parent_ns=None, raise_errors=False): ) -def validate_record(datum, schema, parent_ns=None, raise_errors=False): +def validate_record(datum, schema, parent_ns=None, raise_errors=True): """ Check that the data is a Mapping type with all schema defined fields validated as True. 
- :param datum: Mapping: data to validate schema fields - :param schema: avro schema of 'record' type - :param parent_ns: str: parent namespace - :param raise_errors: bool: should raise ValidationError - :return: bool - :except: ValidationError + Parameters + ---------- + datum: Any + Data being validated + schema: dict + Schema + parent_ns: str + parent namespace + raise_errors: bool + If true, raises ValidationError on invalid data """ if raise_errors: namespace, name = schema_name(schema, parent_ns) @@ -225,18 +267,21 @@ def validate_record(datum, schema, parent_ns=None, raise_errors=False): ) -def validate_union(datum, schema, parent_ns=None, raise_errors=False): +def validate_union(datum, schema, parent_ns=None, raise_errors=True): """ Check that the data is a list type with possible options to validate as True. - :param datum: (Iterable, tuple(name, Iterable)): data to validate - as multiple data types - :param schema: avro schema of 'union' type - :param parent_ns: str: parent namespace - :param raise_errors: bool: should raise ValidationError - :return: bool - :except: ValidationError + Parameters + ---------- + datum: Any + Data being validated + schema: dict + Schema + parent_ns: str + parent namespace + raise_errors: bool + If true, raises ValidationError on invalid data """ if isinstance(datum, tuple): (name, datum) = datum @@ -265,7 +310,7 @@ def validate_union(datum, schema, parent_ns=None, raise_errors=False): return False -BASE_VALIDATORS = { +VALIDATORS = { 'null': validate_null, 'boolean': validate_boolean, 'string': validate_string, @@ -285,21 +330,31 @@ def validate_union(datum, schema, parent_ns=None, raise_errors=False): 'request': validate_record } -VALIDATORS = BASE_VALIDATORS.copy() - - -def register_validator(record_type, validator): - if record_type in BASE_VALIDATORS: - raise ValueError("Not allowed to override Base Validators.") - VALIDATORS[record_type] = validator - - -def get_validator(record_type): - return 
VALIDATORS.get(record_type) - -def validate(datum, schema, field=None, raise_errors=False): - """Determine if a python datum is an instance of a schema.""" +def validate(datum, schema, field=None, raise_errors=True): + """ + Determine if a python datum is an instance of a schema. + + Parameters + ---------- + datum: Any + Data being validated + schema: dict + Schema + field: str, optional + Record field being validated + raise_errors: bool, optional + If true, errors are raised for invalid data. If false, a simple + True (valid) or False (invalid) result is returned + + + Example:: + + from fastavro.validation import validate + schema = {...} + record = {...} + validate(record, schema) + """ record_type = extract_record_type(schema) result = None ns_field = '' @@ -309,7 +364,7 @@ def validate(datum, schema, field=None, raise_errors=False): elif field: ns_field = field - validator = get_validator(record_type) + validator = VALIDATORS.get(record_type) if validator: result = validator(datum, schema=schema, parent_ns=ns_field, @@ -328,28 +383,35 @@ def validate(datum, schema, field=None, raise_errors=False): return result -def validate_many(records, schema, raise_errors=False, stop_count=-1): +def validate_many(records, schema, raise_errors=True): """ Validate a list of data! - :param records: Iterable: list of records to validate - :param schema: Avro schema - :param raise_errors: bool: should raise ValidationError - :param stop_count: int: -1 never stop validation loop - :return: bool - :except: ValidationError + Parameters + ---------- + records: iterable + List of records to validate + schema: dict + Schema + raise_errors: bool, optional + If true, errors are raised for invalid data. If false, a simple + True (valid) or False (invalid) result is returned + + + Example:: + + from fastavro.validation import validate_many + schema = {...} + records = [{...}, {...}, ...] 
+ validate_many(records, schema) """ - error_count = 0 errors = [] results = [] for record in records: try: results.append(validate(record, schema, raise_errors=raise_errors)) except ValidationError as e: - error_count += 1 errors.extend(e.errors) - if error_count == stop_count: - break if raise_errors: raise ValidationError(*errors) return all(results) diff --git a/fastavro/_write.pyx b/fastavro/_write.pyx index d7c5daae..df7c2221 100644 --- a/fastavro/_write.pyx +++ b/fastavro/_write.pyx @@ -16,7 +16,7 @@ from os import urandom from zlib import compress from fastavro import const -from ._validate import validate +from ._validation import validate from ._six import utob, long, iteritems, mk_bits from ._read import HEADER_SCHEMA, SYNC_SIZE, MAGIC from ._schema import ( @@ -427,7 +427,7 @@ cpdef write_union(bytearray fo, datum, schema): best_match_index = -1 most_fields = -1 for index, candidate in enumerate(schema): - if validate(datum, candidate): + if validate(datum, candidate, raise_errors=False): if extract_record_type(candidate) == 'record': fields = len(candidate['fields']) if fields > most_fields: diff --git a/fastavro/_write_py.py b/fastavro/_write_py.py index 5cbde5ab..2b32261a 100644 --- a/fastavro/_write_py.py +++ b/fastavro/_write_py.py @@ -17,7 +17,7 @@ from struct import pack from zlib import compress -from .validate import validate +from .validation import validate from .const import ( MCS_PER_HOUR, MCS_PER_MINUTE, MCS_PER_SECOND, MLS_PER_HOUR, MLS_PER_MINUTE, MLS_PER_SECOND, DAYS_SHIFT @@ -327,7 +327,7 @@ def write_union(fo, datum, schema): best_match_index = -1 most_fields = -1 for index, candidate in enumerate(schema): - if validate(datum, candidate): + if validate(datum, candidate, raise_errors=False): if extract_record_type(candidate) == 'record': fields = len(candidate['fields']) if fields > most_fields: @@ -553,7 +553,7 @@ def writer(fo, Header metadata validator: None, True or a function Validator function. 
If None (the default) - no validation. If True then - then fastavro.writer.validate will be used. If it's a function, it + then fastavro.validation.validate will be used. If it's a function, it should have the same signature as fastavro.writer.validate and raise an exception on error. diff --git a/fastavro/validate.py b/fastavro/validate.py deleted file mode 100644 index 43863c72..00000000 --- a/fastavro/validate.py +++ /dev/null @@ -1,42 +0,0 @@ -'''Fast Avro file iteration. - -Example usage:: - - # Validating - from fastavro import validator - - schema = { - 'doc': 'A weather reading.', - 'name': 'Weather', - 'namespace': 'test', - 'type': 'record', - 'fields': [ - {'name': 'station', 'type': 'string'}, - {'name': 'time', 'type': 'long'}, - {'name': 'temp', 'type': 'int'}, - ], - } - - # 'records' can be an iterable (including generator) - records = [ - {u'station': u'011990-99999', u'temp': 0, u'time': 1433269388}, - {u'station': u'011990-99999', u'temp': 22, u'time': 1433270389}, - {u'station': u'011990-99999', u'temp': -11, u'time': 1433273379}, - {u'station': u'012650-99999', u'temp': 111, u'time': 1433275478}, - ] - - writer(out, schema, records) -''' -try: - from . import _validate -except ImportError: - from . import _validate_py as _validate -from ._validate_common import ValidationErrorData, ValidationError - -validate = _validate.validate -register_validator = _validate.register_validator -get_validator = _validate.get_validator -validate_many = _validate.validate_many - -__all__ = ['ValidationError', 'ValidationErrorData', 'validate', - 'validate_many', 'register_validator', 'get_validator'] diff --git a/fastavro/validation.py b/fastavro/validation.py new file mode 100644 index 00000000..5228e05c --- /dev/null +++ b/fastavro/validation.py @@ -0,0 +1,12 @@ +try: + from . import _validation +except ImportError: + from . 
import _validation_py as _validation +from ._validate_common import ValidationErrorData, ValidationError + +validate = _validation.validate +validate_many = _validation.validate_many + +__all__ = [ + 'ValidationError', 'ValidationErrorData', 'validate', 'validate_many' +] diff --git a/fastavro/write.py b/fastavro/write.py index cb1890d6..a21b5b5f 100644 --- a/fastavro/write.py +++ b/fastavro/write.py @@ -10,12 +10,11 @@ Writer = _write.Writer schemaless_writer = _write.schemaless_writer write_data = _write.write_data -_validate = _write.validate WRITERS = _write.WRITERS LOGICAL_WRITERS = _write.LOGICAL_WRITERS __all__ = [ 'SCHEMA_DEFS', 'acquaint_schema', 'writer', 'schemaless_writer', - 'write_data', 'WRITERS', 'LOGICAL_WRITERS' + 'write_data', 'WRITERS', 'LOGICAL_WRITERS', ] diff --git a/setup.py b/setup.py index 5b6846f5..2358a465 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ Extension('fastavro._schema', ["fastavro/_schema" + ext]), Extension('fastavro._six', ["fastavro/_six" + ext]), Extension('fastavro._write', ["fastavro/_write" + ext]), - Extension('fastavro._validate', ["fastavro/_validate" + ext]), + Extension('fastavro._validation', ["fastavro/_validation" + ext]), ] diff --git a/tests/test_validate.py b/tests/test_validate.py deleted file mode 100644 index 7a9bc246..00000000 --- a/tests/test_validate.py +++ /dev/null @@ -1,66 +0,0 @@ -from fastavro.write import _validate - - -def test_validator_numeric(): - for datum, schema in [ - (1, 'int'), - (1, 'long'), - (1.0, 'float'), - (1.0, 'double'), - (1, 'float'), - (1, 'double'), - ]: - assert _validate(datum, schema) - - for datum, schema in [ - (1.0, 'int'), - (1.0, 'long'), - ("1.0", 'float'), - ("1.0", 'double'), - ("1", 'float'), - ("1", 'double'), - ]: - assert not _validate(datum, schema) - # and plenty more to add I suppose - - -def test_validator_numeric_numpy(): - import numpy as np - np_ints = [ - np.int_, - np.intc, - np.intp, - np.int8, - np.int16, - np.int32, - np.int64, - np.uint8, 
- np.uint16, - np.uint32, - np.uint64, - ] - - np_floats = [ - np.float_, - np.float16, - np.float32, - np.float64, - ] - - schema_ints = ['int', 'long'] - - schema_floats = ['float', 'double'] - - # all these should work - for nptype, schema in zip(np_ints, schema_ints): - assert _validate(nptype(1), schema) - - for nptype, schema in zip(np_ints, schema_floats): - assert _validate(nptype(1), schema) - - for nptype, schema in zip(np_floats, schema_floats): - assert _validate(nptype(1), schema) - - # these shouldn't work - for nptype, schema in zip(np_floats, schema_ints): - assert not _validate(nptype(1), schema) diff --git a/tests/test_validation.py b/tests/test_validation.py index 701d9ae2..36975a07 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -1,5 +1,19 @@ -from fastavro.validate import ValidationError, ValidationErrorData, validate_many +from fastavro.validation import ( + ValidationError, + ValidationErrorData, + validate, + validate_many +) import pytest +import numpy as np +import sys + +# In PY2 when you do type(int) you get <type 'type'> but in PY3 you get +# <class 'type'> +if sys.version_info >= (3, 0): + type_type = 'class' +else: + type_type = 'type' schema = { "fields": [ @@ -45,9 +59,14 @@ def test_validate_string_in_int_raises(): 'integ': 21, }] - with pytest.raises((ValidationError,)): + with pytest.raises(ValidationError) as exc: validation_raise(schema, *records) + for error in exc.value.errors: + expected_type = error.schema + assert expected_type in ['null', 'int'] + assert error.field == 'namespace.missingerror.integ_null' + def test_validate_string_in_int_false(): records = [{ @@ -76,9 +95,15 @@ def test_validate_string_in_int_null_raises(): 'integ_null': 11, 'integ': 'str', }] - with pytest.raises((ValidationError,)): + + with pytest.raises(ValidationError) as exc: validation_raise(schema, *records) + for error in exc.value.errors: + expected_type = error.schema + assert expected_type == 'int' + assert error.field == 
'namespace.missingerror.integ' + def test_validate_string_in_int_null_false(): records = [{ @@ -98,9 +123,15 @@ def test_validate_int_in_string_null_raises(): 'integ_null': 21, 'integ': 21, }] - with pytest.raises((ValidationError,)): + + with pytest.raises(ValidationError) as exc: validation_raise(schema, *records) + for error in exc.value.errors: + expected_type = error.schema + assert expected_type in ['string', 'null'] + assert error.field == 'namespace.missingerror.str_null' + def test_validate_int_in_string_null_false(): records = [{ @@ -120,9 +151,14 @@ def test_validate_int_in_string_raises(): 'integ': 21, }] - with pytest.raises((ValidationError,)): + with pytest.raises(ValidationError) as exc: validation_raise(schema, *records) + for error in exc.value.errors: + expected_type = error.schema + assert expected_type == 'string' + assert error.field == 'namespace.missingerror.str' + def test_validate_int_in_string_false(): records = [{ @@ -160,7 +196,74 @@ def test_validate_null_in_string_false(): def test_validate_error_raises(): with pytest.raises(ValidationError): - raise ValidationError( - ValidationErrorData(10, "string", "test1"), - ValidationErrorData(10, "bytes", "test1"), - ValidationErrorData("bad int", "int", "test1.test_obj.test2")) + raise ValidationError() + + error = ValidationErrorData(10, "string", "test1") + msg = "test1 is <10> of type <{} 'int'> expected string".format(type_type) + assert msg in str(error) + + +def test_validator_numeric(): + for datum, schema in [ + (1, 'int'), + (1, 'long'), + (1.0, 'float'), + (1.0, 'double'), + (1, 'float'), + (1, 'double'), + ]: + validate(datum, schema) + + for datum, schema in [ + (1.0, 'int'), + (1.0, 'long'), + ("1.0", 'float'), + ("1.0", 'double'), + ("1", 'float'), + ("1", 'double'), + ]: + with pytest.raises(ValidationError): + validate(datum, schema) + # and plenty more to add I suppose + + +def test_validator_numeric_numpy(): + np_ints = [ + np.int_, + np.intc, + np.intp, + np.int8, + 
np.int16, + np.int32, + np.int64, + np.uint8, + np.uint16, + np.uint32, + np.uint64, + ] + + np_floats = [ + np.float_, + np.float16, + np.float32, + np.float64, + ] + + schema_ints = ['int', 'long'] + + schema_floats = ['float', 'double'] + + # all these should work + for nptype, schema in zip(np_ints, schema_ints): + validate(nptype(1), schema) + + for nptype, schema in zip(np_ints, schema_floats): + validate(nptype(1), schema) + + for nptype, schema in zip(np_floats, schema_floats): + validate(nptype(1), schema) + + # these shouldn't work + for nptype, schema in zip(np_floats, schema_ints): + with pytest.raises(ValidationError): + validate(nptype(1), schema)